diff --git a/sdc/__init__.py b/sdc/__init__.py index dacf9bfab..db9704985 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -44,6 +44,8 @@ import sdc.datatypes.hpat_pandas_series_rolling_functions import sdc.datatypes.hpat_pandas_stringmethods_functions import sdc.datatypes.hpat_pandas_groupby_functions +import sdc.datatypes.categorical.init +import sdc.datatypes.series.init import sdc.extensions.indexes.range_index_ext diff --git a/sdc/datatypes/categorical/__init__.py b/sdc/datatypes/categorical/__init__.py new file mode 100644 index 000000000..3a1f08efa --- /dev/null +++ b/sdc/datatypes/categorical/__init__.py @@ -0,0 +1,25 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/sdc/datatypes/categorical/boxing.py b/sdc/datatypes/categorical/boxing.py new file mode 100644 index 000000000..81707c264 --- /dev/null +++ b/sdc/datatypes/categorical/boxing.py @@ -0,0 +1,90 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba.extending import box, unbox, NativeValue +from numba.core import boxing +from numba.core.imputils import lower_constant +from numba.np import arrayobj +from numba import types + +from . import pandas_support +from .types import ( + CategoricalDtypeType, + Categorical, +) + + +@box(CategoricalDtypeType) +def box_CategoricalDtype(typ, val, c): + pd_dtype = pandas_support.as_dtype(typ) + return c.pyapi.unserialize(c.pyapi.serialize_object(pd_dtype)) + + +@unbox(CategoricalDtypeType) +def unbox_CategoricalDtype(typ, val, c): + return NativeValue(c.context.get_dummy_value()) + + +@box(Categorical) +def box_Categorical(typ, val, c): + pandas_module_name = c.context.insert_const_string(c.builder.module, "pandas") + pandas_module = c.pyapi.import_module_noblock(pandas_module_name) + + constructor = c.pyapi.object_getattr_string(pandas_module, "Categorical") + + empty_list = c.pyapi.list_new(c.context.get_constant(types.intp, 0)) + args = c.pyapi.tuple_pack([empty_list]) + categorical = c.pyapi.call(constructor, args) + + dtype = box_CategoricalDtype(typ.pd_dtype, val, c) + c.pyapi.object_setattr_string(categorical, "_dtype", dtype) + + codes = boxing.box_array(typ.codes, val, c) + c.pyapi.object_setattr_string(categorical, "_codes", codes) + + c.pyapi.decref(codes) + c.pyapi.decref(dtype) + c.pyapi.decref(args) + c.pyapi.decref(empty_list) + c.pyapi.decref(constructor) + c.pyapi.decref(pandas_module) + return categorical + + +@unbox(Categorical) +def unbox_Categorical(typ, val, c): + codes = c.pyapi.object_getattr_string(val, "codes") + native_value = boxing.unbox_array(typ.codes, codes, c) + c.pyapi.decref(codes) + return native_value + + +@lower_constant(Categorical) +def constant_Categorical(context, builder, ty, pyval): + """ + Create a constant Categorical. + """ + return arrayobj.constant_array(context, builder, ty.codes, pyval.codes) diff --git a/sdc/datatypes/categorical/functions.py b/sdc/datatypes/categorical/functions.py new file mode 100644 index 000000000..af166b085 --- /dev/null +++ b/sdc/datatypes/categorical/functions.py @@ -0,0 +1,38 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from sdc.utilities.utils import sdc_overload_attribute + +from .types import CategoricalDtypeType + + +@sdc_overload_attribute(CategoricalDtypeType, 'ordered') +def pd_CategoricalDtype_categories_overload(self): + ordered = self.ordered + + def impl(self): + return ordered + return impl diff --git a/sdc/datatypes/categorical/init.py b/sdc/datatypes/categorical/init.py new file mode 100644 index 000000000..1461c0105 --- /dev/null +++ b/sdc/datatypes/categorical/init.py @@ -0,0 +1,44 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +Init Numba extension for Pandas Categorical. +""" + +from . import types +from . import typeof +from . import models +from . import boxing +from . import pdimpl +from . import rewrites +from . import functions + +import numba + + +# register new types in numba.types for using in objmode +setattr(numba.types, "CategoricalDtype", types.CategoricalDtypeType) +setattr(numba.types, "Categorical", types.Categorical) diff --git a/sdc/datatypes/categorical/models.py b/sdc/datatypes/categorical/models.py new file mode 100644 index 000000000..ac586ed51 --- /dev/null +++ b/sdc/datatypes/categorical/models.py @@ -0,0 +1,37 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba.extending import models +from numba.extending import register_model + +from .types import ( + CategoricalDtypeType, + Categorical, +) + + +register_model(CategoricalDtypeType)(models.OpaqueModel) +register_model(Categorical)(models.ArrayModel) diff --git a/sdc/datatypes/categorical/pandas_support.py b/sdc/datatypes/categorical/pandas_support.py new file mode 100644 index 000000000..28c90976c --- /dev/null +++ b/sdc/datatypes/categorical/pandas_support.py @@ -0,0 +1,63 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd + +from numba import types + +from .types import CategoricalDtypeType + + +def from_dtype(pdtype): + """ + Return a Numba Type instance corresponding to the given Pandas *dtype*. + NotImplementedError is raised if unsupported Pandas dtypes. + """ + # TODO: use issubclass + if isinstance(pdtype, pd.CategoricalDtype): + if pdtype.categories is None: + categories = None + else: + categories = list(pdtype.categories) + return CategoricalDtypeType(categories=categories, + ordered=pdtype.ordered) + + raise NotImplementedError("%r cannot be represented as a Numba type" + % (pdtype,)) + + +def as_dtype(nbtype): + """ + Return a Pandas *dtype* instance corresponding to the given Numba type. + NotImplementedError is raised if no correspondence is known. + """ + nbtype = types.unliteral(nbtype) + if isinstance(nbtype, CategoricalDtypeType): + return pd.CategoricalDtype(categories=nbtype.categories, + ordered=nbtype.ordered) + + raise NotImplementedError("%r cannot be represented as a Pandas dtype" + % (nbtype,)) diff --git a/sdc/datatypes/categorical/pdimpl.py b/sdc/datatypes/categorical/pdimpl.py new file mode 100644 index 000000000..5f0f7d7c6 --- /dev/null +++ b/sdc/datatypes/categorical/pdimpl.py @@ -0,0 +1,242 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd + +from numba.extending import overload +from numba.extending import intrinsic +from numba.extending import type_callable +from numba.extending import lower_builtin +from numba import types +from numba import typeof +from numba import objmode + +from .types import ( + CategoricalDtypeType, + Categorical, +) + +from . import pandas_support + + +# Possible alternative implementations: +# 1. @overload + @intrinsic +# 2. @type_callable + @lower_builtin +# They are equivalent. Who is defined firts - has higher priority. + + +def _reconstruct_CategoricalDtype(dtype): + if isinstance(dtype, types.Literal): + return dtype.literal_value + + if isinstance(dtype, CategoricalDtypeType): + return pandas_support.as_dtype(dtype) + + raise NotImplementedError() + + +@overload(pd.CategoricalDtype) +def _CategoricalDtype(categories=None, ordered=None): + """ + Implementation of constructor for pandas CategoricalDtype. + """ + if isinstance(ordered, types.Literal): + ordered_const = ordered.literal_value + else: + ordered_const = ordered + + def impl(categories=None, ordered=None): + return _CategoricalDtype_intrinsic(categories, ordered_const) + return impl + + +@intrinsic +def _CategoricalDtype_intrinsic(typingctx, categories, ordered): + """ + Creates CategoricalDtype object. + + Assertions: + categories - Tuple of literal values or None + ordered - literal Bool + """ + if isinstance(categories, types.NoneType): + categories_list = None + if isinstance(categories, types.Tuple): + categories_list = [c.literal_value for c in categories] + + if isinstance(ordered, types.NoneType): + ordered_value = None + if isinstance(ordered, types.Literal): + ordered_value = ordered.literal_value + + return_type = CategoricalDtypeType(categories_list, ordered_value) + sig = return_type(categories, ordered) + + def codegen(context, builder, signature, args): + # All CategoricalDtype objects are dummy values in LLVM. + # They only exist in the type level. + return context.get_dummy_value() + + return sig, codegen + + +# TODO: move to tools +def is_categoricaldtype(dtype): + if isinstance(dtype, types.Literal) and dtype.literal_value == 'category': + return True + + if isinstance(dtype, CategoricalDtypeType): + return True + + return False + + +# @type_callable(pd.CategoricalDtype) +# def type_CategoricalDtype_constructor(context): +# def typer(categories, ordered): +# # TODO: check all Literal in categories +# if isinstance(categories, types.Tuple) and isinstance(ordered, types.Literal): +# categories_list = [c.literal_value for c in categories] +# return CategoricalDtypeType(categories_list, ordered.literal_value) + +# return typer + + +# @lower_builtin(pd.CategoricalDtype, types.Any, types.Any) +# def _CategoricalDtype_constructor(context, builder, sig, args): +# # All CategoricalDtype objects are dummy values in LLVM. +# # They only exist in the type level. +# return context.get_dummy_value() + + +# @type_callable(pd.CategoricalDtype) +# def type_CategoricalDtype_constructor(context): +# def typer(categories): +# # TODO: check all Literal in categories +# if isinstance(categories, types.Tuple): +# categories_list = [c.literal_value for c in categories] +# return CategoricalDtypeType(categories_list) + +# return typer + + +# @lower_builtin(pd.CategoricalDtype, types.Any) +# def _CategoricalDtype_constructor(context, builder, sig, args): +# # All CategoricalDtype objects are dummy values in LLVM. +# # They only exist in the type level. +# return context.get_dummy_value() + + +# TODO: use dtype too +def _reconstruct_Categorical(values): + values_list = [v.literal_value for v in values] + return pd.Categorical(values=values_list) + + +@overload(pd.Categorical) +def _Categorical(values, categories=None, ordered=None, dtype=None, fastpath=False): + """ + Implementation of constructor for pandas Categorical via objmode. + """ + # TODO: support other parameters (only values now) + + ty = typeof(_reconstruct_Categorical(values)) + + from textwrap import dedent + text = dedent(f""" + def impl(values, categories=None, ordered=None, dtype=None, fastpath=False): + with objmode(categorical="{ty}"): + categorical = pd.Categorical(values, categories, ordered, dtype, fastpath) + return categorical + """) + globals, locals = {'objmode': objmode, 'pd': pd}, {} + exec(text, globals, locals) + impl = locals['impl'] + return impl + + +# @type_callable(pd.Categorical) +# def type_Categorical_constructor(context): +# """ +# Similar to @infer_global(np.array). +# """ +# def typer(values, categories=None, ordered=None, dtype=None, fastpath=False): +# # from numba.core.typing import npydecl +# # codes = npydecl.NpArray(context).generic()(values) +# categorical = _reconstruct_Categorical(values) +# return typeof(categorical) + +# return typer + + +# @lower_builtin(pd.Categorical, types.Any) +# # @lower_builtin(np.Categorical, types.Any, types.DTypeSpec) +# def pd_Categorical(context, builder, sig, args): +# """ +# Similar to @lower_builtin(np.array, ...). +# """ +# from numba.np import arrayobj +# codes = sig.return_type.codes +# return arrayobj.np_array(context, builder, sig.replace(return_type=codes), args) + + +# via intrinsic +# @overload(pd.Categorical) +# def _Categorical(values, categories=None, ordered=None, dtype=None, fastpath=False): +# """ +# Implementation of constructor for pandas Categorical. +# """ +# def impl(values, categories=None, ordered=None, dtype=None, fastpath=False): +# return _Categorical_intrinsic(values, categories, ordered, dtype, fastpath) +# return impl + + +# @intrinsic +# def _Categorical_intrinsic(typingctx, values, categories, ordered, dtype, fastpath): +# """ +# Creates Categorical object. +# """ +# if isinstance(values, types.Tuple): +# values_list = [v.literal_value for v in values] +# categorical = pd.Categorical(values=values_list) +# return_type = typeof(categorical) + +# def codegen(context, builder, signature, args): +# [values] = args +# # TODO: can not recall similar function +# native_value = boxing.unbox_array(typ.codes, codes, c) +# return native_value + +# sig = return_type(values, categories, ordered, dtype, fastpath) +# return sig, codegen + +# # return_type = Categorical(dtype=CategoricalDtypeType(), codes=types.Array(types.int8, 1, 'C')) +# # sig = return_type(values) + +# # def codegen(context, builder, signature, args): +# # return context.get_dummy_value() + +# # return sig, codegen diff --git a/sdc/datatypes/categorical/rewrites.py b/sdc/datatypes/categorical/rewrites.py new file mode 100644 index 000000000..f979238e6 --- /dev/null +++ b/sdc/datatypes/categorical/rewrites.py @@ -0,0 +1,32 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd +from sdc.datatypes.common.rewriteutils import register_tuplify + + +register_tuplify(pd.CategoricalDtype, 'categories') +register_tuplify(pd.Categorical, 'values') diff --git a/sdc/datatypes/categorical/typeof.py b/sdc/datatypes/categorical/typeof.py new file mode 100644 index 000000000..865c79284 --- /dev/null +++ b/sdc/datatypes/categorical/typeof.py @@ -0,0 +1,53 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +pandas.CategoricalDtype +""" + +import pandas as pd + +from numba.extending import typeof_impl +from numba.np import numpy_support +from numba import typeof + +from . import pandas_support +from .types import Categorical + + +@typeof_impl.register(pd.CategoricalDtype) +def _typeof_CategoricalDtype(val, c): + return pandas_support.from_dtype(val) + + +@typeof_impl.register(pd.Categorical) +def _typeof_Categorical(val, c): + try: + dtype = pandas_support.from_dtype(val.dtype) + except NotImplementedError: + raise ValueError("Unsupported Categorical dtype: %s" % (val.dtype,)) + codes = typeof(val.codes) + return Categorical(dtype=dtype, codes=codes) diff --git a/sdc/datatypes/categorical/types.py b/sdc/datatypes/categorical/types.py new file mode 100644 index 000000000..1b59035ca --- /dev/null +++ b/sdc/datatypes/categorical/types.py @@ -0,0 +1,123 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +Numba types for support pandas Categorical. +""" + +from numba import types + +import numpy as np + + +__all__ = [ + 'CategoricalDtypeType', + 'Categorical', +] + + +# TODO: consider renaming to CategoricalDtype b/c Categorical - not CategoricalType +class CategoricalDtypeType(types.Opaque): + """ + Numba type for pandas CategoricalDtype. + + Contains: + categories -> array-like + ordered -> bool + """ + def __init__(self, categories=None, ordered=None): + self.categories = categories + self.ordered = ordered + super().__init__(name=self.__repr__()) + + def __repr__(self): + return 'CategoricalDtype(categories={}, ordered={})'.format( + self.categories, self.ordered) + + def __len__(self): + return len(self.categories) if self.categories else 0 + + @property + def dtype(self): + # TODO: take dtype from categories array + return types.int64 + + def int_type(self): + """ + Return minimal int type to fit all categories. + """ + dtype = types.int64 + n_cats = len(self.categories) + if n_cats < np.iinfo(np.int8).max: + dtype = types.int8 + elif n_cats < np.iinfo(np.int16).max: + dtype = types.int16 + elif n_cats < np.iinfo(np.int32).max: + dtype = types.int32 + return dtype + + +# TODO: make ArrayCompatible. It will make reuse Array boxing, unboxing. +class Categorical(types.Type): + """ + Numba type for pandas Categorical. + + Contains: + codes -> array-like + dtype -> CategoricalDtypeType + """ + def __init__(self, dtype, codes=None): + assert(isinstance(dtype, CategoricalDtypeType)) + self.pd_dtype = dtype + self.codes = codes or types.Array(self.pd_dtype.int_type(), ndim=1, layout='C') + # TODO: store dtype for categories values and use it for dtype + super().__init__(name=self.__repr__()) + + def __repr__(self): + def Array__repr__(array): + return "Array({}, {}, {})".format( + self.codes.dtype.__repr__(), + self.codes.ndim.__repr__(), + self.codes.layout.__repr__() + ) + + dtype = self.pd_dtype.__repr__() + codes = Array__repr__(self.codes) + return 'Categorical(dtype={}, codes={})'.format(dtype, codes) + + @property + def categories(self): + return self.pd_dtype.categories + + # Properties for model + + @property + def ndim(self): + return self.codes.ndim + + @property + def dtype(self): + return self.codes.dtype diff --git a/sdc/datatypes/common/__init__.py b/sdc/datatypes/common/__init__.py new file mode 100644 index 000000000..3a1f08efa --- /dev/null +++ b/sdc/datatypes/common/__init__.py @@ -0,0 +1,25 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/sdc/datatypes/common/rewriteutils.py b/sdc/datatypes/common/rewriteutils.py new file mode 100644 index 000000000..6c9010f73 --- /dev/null +++ b/sdc/datatypes/common/rewriteutils.py @@ -0,0 +1,92 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba.core.rewrites import register_rewrite, Rewrite +from numba import errors +from numba.core import ir +from numba.core.ir_utils import guard, get_definition + + +class TuplifyArgs(Rewrite): + """ + Base rewrite calls to *callee*. Replaces *arg* from list and set to tuple. + + Redefine callee and arg in subclass. + """ + + # need to be defined in subclasses + callee = None + arg = None + expr_checker = None + + def match_expr(self, expr, func_ir, block, typemap, calltypes): + """For extended checks in supbclasses.""" + if self.expr_checker: + return self.expr_checker(expr, func_ir, block, typemap, calltypes) + return True + + def match(self, func_ir, block, typemap, calltypes): + self.args = args = [] + self.block = block + for inst in block.find_insts(ir.Assign): + if isinstance(inst.value, ir.Expr) and inst.value.op == 'call': + expr = inst.value + try: + callee = func_ir.infer_constant(expr.func) + except errors.ConstantInferenceError: + continue + if callee is self.callee: + if not self.match_expr(expr, func_ir, block, typemap, calltypes): + continue + + arg_var = None + if len(expr.args): + arg_var = expr.args[0] + elif len(expr.kws) and expr.kws[0][0] == self.arg: + arg_var = expr.kws[0][1] + if arg_var: + arg_var_def = guard(get_definition, func_ir, arg_var) + if arg_var_def and arg_var_def.op in ('build_list', 'build_set'): + args.append(arg_var_def) + return len(args) > 0 + + def apply(self): + """ + Replace list expression with tuple. + """ + block = self.block + for inst in block.body: + if isinstance(inst, ir.Assign) and inst.value in self.args: + inst.value.op = 'build_tuple' + return block + + +def register_tuplify(_callee, _arg, _expr_checker=None): + @register_rewrite('before-inference') + class Tuplifier(TuplifyArgs): + callee = _callee + arg = _arg + expr_checker = _expr_checker diff --git a/sdc/datatypes/hpat_pandas_functions.py b/sdc/datatypes/hpat_pandas_functions.py index 6d70cc108..a82813836 100644 --- a/sdc/datatypes/hpat_pandas_functions.py +++ b/sdc/datatypes/hpat_pandas_functions.py @@ -44,6 +44,7 @@ from sdc.str_arr_ext import string_array_type from sdc.hiframes import join, aggregate, sort +from sdc.types import CategoricalDtypeType, Categorical def get_numba_array_types_for_csv(df): @@ -266,6 +267,7 @@ def sdc_pandas_read_csv( values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values] values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values] values = [string_array_type if v == str else v for v in values] + values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values] dtype = dict(zip(keys, values)) diff --git a/sdc/datatypes/series/__init__.py b/sdc/datatypes/series/__init__.py new file mode 100644 index 000000000..3a1f08efa --- /dev/null +++ b/sdc/datatypes/series/__init__.py @@ -0,0 +1,25 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/sdc/datatypes/series/boxing.py b/sdc/datatypes/series/boxing.py new file mode 100644 index 000000000..51e41d05e --- /dev/null +++ b/sdc/datatypes/series/boxing.py @@ -0,0 +1,59 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba.core.imputils import lower_constant +from numba.core import cgutils + +from .types import SeriesType + + +@lower_constant(SeriesType) +def constant_Series(context, builder, ty, pyval): + """ + Create a constant Series. + + See @unbox(SeriesType) + """ + series = cgutils.create_struct_proxy(ty)(context, builder) + series.data = _constant_Series_data(context, builder, ty, pyval) + # TODO: index and name + return series._getvalue() + + +def _constant_Series_data(context, builder, ty, pyval): + """ + Create a constant for Series data. + + Mostly reuses constant creation for pandas arrays. + """ + + from ..categorical.types import CategoricalDtypeType + + if isinstance(ty.dtype, CategoricalDtypeType): + from ..categorical.boxing import constant_Categorical + return constant_Categorical(context, builder, ty.data, pyval.array) + + raise NotImplementedError() diff --git a/sdc/datatypes/series/init.py b/sdc/datatypes/series/init.py new file mode 100644 index 000000000..c44b21dbf --- /dev/null +++ b/sdc/datatypes/series/init.py @@ -0,0 +1,40 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +Init Numba extension for Pandas Series. +""" + +from . import types +from . import boxing +from . import pdimpl +from . import rewrites + +import numba + + +# register Series in numba.types for using in objmode +setattr(numba.types, 'series', types.SeriesType) diff --git a/sdc/datatypes/series/pdimpl.py b/sdc/datatypes/series/pdimpl.py new file mode 100644 index 000000000..5788e0b67 --- /dev/null +++ b/sdc/datatypes/series/pdimpl.py @@ -0,0 +1,60 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd + +from numba import typeof +from numba import types +from numba import objmode + +from ..categorical.pdimpl import _reconstruct_CategoricalDtype + + +def _reconstruct_Series(data, dtype): + values_list = [v.literal_value for v in data] + dtype = _reconstruct_CategoricalDtype(dtype) + return pd.Series(data=values_list, dtype=dtype) + + +def _Series_category(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): + """ + Implementation of constructor for pandas Series via objmode. + """ + # TODO: support other parameters (only data and dtype now) + + ty = typeof(_reconstruct_Series(data, dtype)) + + from textwrap import dedent + text = dedent(f""" + def impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): + with objmode(series="{ty}"): + series = pd.Series(data, index, dtype, name, copy, fastpath) + return series + """) + globals, locals = {'objmode': objmode, 'pd': pd}, {} + exec(text, globals, locals) + impl = locals['impl'] + return impl diff --git a/sdc/datatypes/series/rewrites.py b/sdc/datatypes/series/rewrites.py new file mode 100644 index 000000000..dce8cb5cc --- /dev/null +++ b/sdc/datatypes/series/rewrites.py @@ -0,0 +1,56 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd +from sdc.datatypes.common.rewriteutils import register_tuplify + +from numba.core import ir +from numba.core.ir_utils import guard, get_definition + + +def check_dtype_is_categorical(self, expr, func_ir, block, typemap, calltypes): + dtype_var = None + for name, var in expr.kws: + if name == 'dtype': + dtype_var = var + if not dtype_var: + return False + + dtype_var_def = guard(get_definition, func_ir, dtype_var) + is_alias = isinstance(dtype_var_def, ir.Const) and dtype_var_def.value == 'category' + is_categoricaldtype = (hasattr(dtype_var_def, 'func') and + func_ir.infer_constant(dtype_var_def.func) == pd.CategoricalDtype) + if not (is_alias or is_categoricaldtype): + return False + + return True + + +def expr_checker(self, expr, func_ir, block, typemap, calltypes): + return check_dtype_is_categorical(self, expr, func_ir, block, typemap, calltypes) + + +register_tuplify(pd.Series, 'data', expr_checker) diff --git a/sdc/datatypes/series/types.py b/sdc/datatypes/series/types.py new file mode 100644 index 000000000..5c78a58bc --- /dev/null +++ b/sdc/datatypes/series/types.py @@ -0,0 +1,35 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +""" +Numba types for support pandas Series. +""" + +__all__ = [ + 'SeriesType', +] + +from sdc.hiframes.pd_series_type import SeriesType diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index d169f7cb4..8d21402a1 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -43,8 +43,8 @@ from sdc.hiframes.pd_dataframe_type import DataFrameType from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (string_array_type, unbox_str_series, box_str_arr) -from sdc.hiframes.pd_categorical_ext import (PDCategoricalDtype, - box_categorical_array, unbox_categorical_array) +from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical +from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical from sdc.hiframes.pd_series_ext import SeriesType from sdc.hiframes.pd_series_type import _get_series_array_type @@ -153,8 +153,8 @@ def _infer_series_dtype(S): else: raise ValueError( "object dtype infer: data type for column {} not supported".format(S.name)) - elif isinstance(S.dtype, pandas.api.types.CategoricalDtype): - return PDCategoricalDtype(S.dtype.categories) + elif isinstance(S.dtype, pd.CategoricalDtype): + return numba.typeof(S.dtype) # regular numpy types try: return numpy_support.from_dtype(S.dtype) @@ -227,8 +227,8 @@ def box_dataframe(typ, val, c): if dtype == string_type: arr_obj = box_str_arr(arr_typ, arr, c) - elif isinstance(dtype, PDCategoricalDtype): - arr_obj = box_categorical_array(arr_typ, arr, c) + elif isinstance(arr_typ, Categorical): + arr_obj = box_Categorical(arr_typ, arr, c) # context.nrt.incref(builder, arr_typ, arr) elif dtype == types.List(string_type): arr_obj = box_list(list_string_array_type, arr, c) @@ -320,8 +320,8 @@ def _unbox_series_data(dtype, data_typ, arr_obj, c): return unbox_str_series(string_array_type, arr_obj, c) elif data_typ == list_string_array_type: return _unbox_array_list_str(arr_obj, c) - elif isinstance(dtype, PDCategoricalDtype): - return unbox_categorical_array(data_typ, arr_obj, c) + elif isinstance(dtype, CategoricalDtypeType): + return unbox_Categorical(data_typ, arr_obj, c) # TODO: error handling like Numba callwrappers.py return unbox_array(data_typ, arr_obj, c) @@ -373,8 +373,8 @@ def _box_series_data(dtype, data_typ, val, c): if dtype == string_type: arr = box_str_arr(string_array_type, val, c) - elif isinstance(dtype, PDCategoricalDtype): - arr = box_categorical_array(data_typ, val, c) + elif isinstance(dtype, CategoricalDtypeType): + arr = box_Categorical(data_typ, val, c) elif dtype == types.List(string_type): arr = box_list(list_string_array_type, val, c) else: diff --git a/sdc/hiframes/pd_categorical_ext.py b/sdc/hiframes/pd_categorical_ext.py index 7b99d843c..311ed580e 100644 --- a/sdc/hiframes/pd_categorical_ext.py +++ b/sdc/hiframes/pd_categorical_ext.py @@ -49,7 +49,7 @@ def __init__(self, dtype): dtype, 1, 'C', name='CategoricalArray({})'.format(dtype)) -@unbox(CategoricalArray) +# @unbox(CategoricalArray) def unbox_categorical_array(typ, val, c): arr_obj = c.pyapi.object_getattr_string(val, "codes") # c.pyapi.print_object(arr_obj) @@ -71,7 +71,7 @@ def get_categories_int_type(cat_dtype): return dtype -@box(CategoricalArray) +# @box(CategoricalArray) def box_categorical_array(typ, val, c): dtype = typ.dtype mod_name = c.context.insert_const_string(c.builder.module, "pandas") diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 923904b73..7fc552aa4 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -46,6 +46,8 @@ from sdc.str_ext import string_type, list_string_array_type from sdc.hiframes.pd_series_type import SeriesType +from sdc.datatypes.categorical.pdimpl import is_categoricaldtype +from sdc.datatypes.series.pdimpl import _Series_category def is_str_series_typ(t): @@ -109,6 +111,8 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, ----------- - Parameters ``dtype`` and ``copy`` are currently unsupported. - Types iterable and dict as ``data`` parameter are currently unsupported. + - Categorical types (i.e. 'category' and ``CategoricalDtype``) are supported in ``dtype`` + only if they are provided as constants in jitted code. Examples -------- @@ -116,6 +120,11 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, >>> pd.Series([1, 2, 3], ['A', 'B', 'C']) + Create Series with categorical data: + + >>> pd.Series([1, 2, 3], dtype='category') + >>> pd.Series([1, 2, 3], dtype=CategoricalDtype([1, 2, 3])) + .. seealso:: :ref:`DataFrame ` @@ -124,6 +133,9 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, is_index_none = isinstance(index, types.NoneType) or index is None + if is_categoricaldtype(dtype): + return _Series_category(data, index, dtype, name, copy, fastpath) + def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): '''' use binop here as otherwise Numba's dead branch pruning doesn't work diff --git a/sdc/hiframes/pd_series_type.py b/sdc/hiframes/pd_series_type.py index e9f17f5b4..32e004a14 100644 --- a/sdc/hiframes/pd_series_type.py +++ b/sdc/hiframes/pd_series_type.py @@ -36,8 +36,8 @@ from numba.np.arrayobj import make_array, _getitem_array_single_int from sdc.str_ext import string_type, list_string_array_type -from sdc.hiframes.pd_categorical_ext import (PDCategoricalDtype, CategoricalArray) from sdc.str_arr_ext import (string_array_type, iternext_str_array, StringArrayType) +from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical class SeriesType(types.IterableType): @@ -271,8 +271,9 @@ def _get_series_array_type(dtype): return string_array_type # categorical - if isinstance(dtype, PDCategoricalDtype): - return CategoricalArray(dtype) + if isinstance(dtype, CategoricalDtypeType): + # TODO: pass codes array if exists + return Categorical(dtype) # use recarray data layout for series of tuples if isinstance(dtype, types.BaseTuple): diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py index 99a902d05..2a569945d 100644 --- a/sdc/io/csv_ext.py +++ b/sdc/io/csv_ext.py @@ -56,7 +56,7 @@ import pandas as pd import numpy as np -from sdc.hiframes.pd_categorical_ext import (PDCategoricalDtype, CategoricalArray) +from sdc.types import CategoricalDtypeType, Categorical import pyarrow import pyarrow.csv @@ -302,13 +302,12 @@ def box_stream_reader(typ, val, c): def _get_dtype_str(t): dtype = t.dtype - if isinstance(dtype, PDCategoricalDtype): - cat_arr = CategoricalArray(dtype) - # HACK: add cat type to numba.types - # FIXME: fix after Numba #3372 is resolved - cat_arr_name = 'CategoricalArray' + str(ir_utils.next_label()) - setattr(types, cat_arr_name, cat_arr) - return cat_arr_name + + if isinstance(t, Categorical): + # return categorical representation + # for some reason pandas and pyarrow read_csv() return CategoricalDtype with + # ordered=False in case when dtype is with ordered=None + return str(t).replace('ordered=None', 'ordered=False') if dtype == types.NPDatetime('ns'): dtype = 'NPDatetime("ns")' @@ -322,8 +321,8 @@ def _get_dtype_str(t): def _get_pd_dtype_str(t): dtype = t.dtype - if isinstance(dtype, PDCategoricalDtype): - return 'pd.api.types.CategoricalDtype({})'.format(dtype.categories) + if isinstance(t, Categorical): + return 'pd.{}'.format(t.pd_dtype) if dtype == types.NPDatetime('ns'): dtype = 'str' if t == string_array_type: @@ -430,75 +429,6 @@ def pandas_read_csv( This function has the same interface as pandas.read_csv. """ - # Fallback to pandas - need_categorical = isinstance(dtype, pd.CategoricalDtype) - try: - need_categorical |= any(isinstance(v, pd.CategoricalDtype) for v in dtype.values()) - except: pass - - fallback_to_pandas = need_categorical - - if fallback_to_pandas: - return pd.read_csv( - filepath_or_buffer=filepath_or_buffer, - sep=sep, - delimiter=delimiter, - # Column and Index Locations and Names - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - prefix=prefix, - mangle_dupe_cols=mangle_dupe_cols, - # General Parsing Configuration - dtype=dtype, - engine=engine, - converters=converters, - true_values=true_values, - false_values=false_values, - skipinitialspace=skipinitialspace, - skiprows=skiprows, - skipfooter=skipfooter, - nrows=nrows, - # NA and Missing Data Handling - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - verbose=verbose, - skip_blank_lines=skip_blank_lines, - # Datetime Handling - parse_dates=parse_dates, - infer_datetime_format=infer_datetime_format, - keep_date_col=keep_date_col, - date_parser=date_parser, - dayfirst=dayfirst, - cache_dates=cache_dates, - # Iteration - iterator=iterator, - chunksize=chunksize, - # Quoting, Compression, and File Format - compression=compression, - thousands=thousands, - decimal=decimal, - lineterminator=lineterminator, - quotechar=quotechar, - # quoting=csv.QUOTE_MINIMAL, # not supported - doublequote=doublequote, - escapechar=escapechar, - comment=comment, - encoding=encoding, - dialect=dialect, - # Error Handling - error_bad_lines=error_bad_lines, - warn_bad_lines=warn_bad_lines, - # Internal - delim_whitespace=delim_whitespace, - # low_memory=_c_parser_defaults["low_memory"], # not supported - memory_map=memory_map, - float_precision=float_precision, - ) - if delimiter is None: delimiter = sep @@ -506,8 +436,6 @@ def pandas_read_csv( include_columns = None - # categories = None - if usecols: if type(usecols[0]) == str: if names: @@ -529,30 +457,43 @@ def pandas_read_csv( # categories = [f"f{names_list.index(k)}" for k in keys] # except: pass + categories = [] + if dtype: if names: names_list = list(names) if isinstance(dtype, dict): - dtype = {f"f{names_list.index(k)}": pyarrow.from_numpy_dtype(v) for k, v in dtype.items()} + column_types = {} + for k, v in dtype.items(): + column_name = "f{}".format(names_list.index(k)) + if isinstance(v, pd.CategoricalDtype): + categories.append(column_name) + column_type = pyarrow.string() + else: + column_type = pyarrow.from_numpy_dtype(v) + column_types[column_name] = column_type else: - dtype = {f"f{names_list.index(k)}": pyarrow.from_numpy_dtype(dtype) for k in names} + pa_dtype = pyarrow.from_numpy_dtype(dtype) + column_types = {f"f{names_list.index(k)}": pa_dtype for k in names} elif usecols: if isinstance(dtype, dict): - dtype = {k: pyarrow.from_numpy_dtype(v) for k, v in dtype.items()} + column_types = {k: pyarrow.from_numpy_dtype(v) for k, v in dtype.items()} else: - dtype = {k: pyarrow.from_numpy_dtype(dtype) for k in usecols} + column_types = {k: pyarrow.from_numpy_dtype(dtype) for k in usecols} else: if isinstance(dtype, dict): - dtype = {k: pyarrow.from_numpy_dtype(v) for k, v in dtype.items()} + column_types = {k: pyarrow.from_numpy_dtype(v) for k, v in dtype.items()} else: - dtype = pyarrow.from_numpy_dtype(dtype) + column_types = pyarrow.from_numpy_dtype(dtype) + else: + column_types = None try: for column in parse_dates: name = f"f{column}" # TODO: Try to help pyarrow infer date type - set DateType. # dtype[name] = pyarrow.from_numpy_dtype(np.datetime64) # string - del dtype[name] + del column_types[name] except: pass parse_options = pyarrow.csv.ParseOptions( @@ -566,7 +507,7 @@ def pandas_read_csv( ) convert_options = pyarrow.csv.ConvertOptions( - column_types=dtype, + column_types=column_types, strings_can_be_null=True, include_columns=include_columns, ) @@ -579,7 +520,7 @@ def pandas_read_csv( ) dataframe = table.to_pandas( - # categories=categories, + # categories=categories or None, ) if names: @@ -591,6 +532,12 @@ def pandas_read_csv( else: dataframe.columns = names + # fix when PyArrow will support predicted categories + if isinstance(dtype, dict): + for column_name, column_type in dtype.items(): + if isinstance(column_type, pd.CategoricalDtype): + dataframe[column_name] = dataframe[column_name].astype(column_type) + return dataframe @@ -605,11 +552,11 @@ def _gen_csv_reader_py_pyarrow_func_text_core(col_names, col_typs, dtype_present date_inds = ", ".join(str(i) for i, t in enumerate(col_typs) if t.dtype == types.NPDatetime('ns')) return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names nb_objmode_vars = ", ".join([ - "{}='{}'".format(to_varname(cname), _get_dtype_str(t)) + '{}="{}"'.format(to_varname(cname), _get_dtype_str(t)) for cname, t in zip(return_columns, col_typs) ]) pd_dtype_strs = ", ".join([ - "'{}':{}".format(cname, _get_pd_dtype_str(t)) + "'{}': {}".format(cname, _get_pd_dtype_str(t)) for cname, t in zip(return_columns, col_typs) ]) @@ -670,15 +617,10 @@ def _gen_csv_reader_py_pyarrow_func_text_dataframe(col_names, col_typs, dtype_pr def _gen_csv_reader_py_pyarrow_py_func(func_text, func_name): - # print(func_text) - glbls = globals() # TODO: fix globals after Numba's #3355 is resolved - # {'objmode': objmode, 'csv_file_chunk_reader': csv_file_chunk_reader, - # 'pd': pd, 'np': np} - loc_vars = {} - exec(func_text, glbls, loc_vars) - csv_reader_py = loc_vars[func_name] - - return csv_reader_py + locals = {} + exec(func_text, globals(), locals) + func = locals[func_name] + return func def _gen_csv_reader_py_pyarrow_jit_func(csv_reader_py): diff --git a/sdc/rewrites/read_csv_consts.py b/sdc/rewrites/read_csv_consts.py index 131988bb6..69b09aff1 100644 --- a/sdc/rewrites/read_csv_consts.py +++ b/sdc/rewrites/read_csv_consts.py @@ -25,127 +25,57 @@ # ***************************************************************************** from numba.core.rewrites import register_rewrite, Rewrite -from numba.core.ir_utils import find_callname, guard, mk_unique_var +from numba.core.ir_utils import guard, get_definition from numba import errors from numba.core import ir -from numba.core import consts -from sdc.rewrites.ir_utils import remove_unused_recursively, make_assign, find_operations +from sdc.rewrites.ir_utils import find_operations - -def find_build_sequence(func_ir, var): - """Reimplemented from numba.core.ir_utils.find_build_sequence - Added 'build_map' to build_ops list. - """ - from numba.core.ir_utils import (require, get_definition) - - require(isinstance(var, ir.Var)) - var_def = get_definition(func_ir, var) - require(isinstance(var_def, ir.Expr)) - build_ops = ['build_tuple', 'build_list', 'build_set', 'build_map'] - require(var_def.op in build_ops) - return var_def.items, var_def.op - - -class ConstantInference(consts.ConstantInference): - - def _infer_expr(self, expr): - if expr.op == 'build_map': - def inf_const(value): - return self.infer_constant(value.name, loc=expr.loc) - return {inf_const(k): inf_const(v) for k, v in expr.items} - return super()._infer_expr(expr) +import pandas as pd @register_rewrite('before-inference') class RewriteReadCsv(Rewrite): """ - Searches for calls of pandas.read_csv() and replace it with calls of read_csv. + Searches for calls to Pandas read_csv() and replace its arguments with tuples. """ - _pandas_read_csv_calls = [ - ('read_csv', 'pandas'), # for calls like pandas.read_csv() - ('read_csv', 'pandas.io.parsers'), # for calls like read_csv = pandas.read_csv, read_csv() - ] - _read_csv_const_args = ('names', 'dtype', 'usecols') def match(self, func_ir, block, typemap, calltypes): - # TODO: 1. save instructions of build_map, build_list for read_csv params - # 2. check that vars are used only in read_csv - # 3. replace vars with build_tuple inplace + # TODO: check that vars are used only in read_csv - self.func_ir = func_ir self.block = block - self.consts = consts = {} + self.args = args = [] # Find all assignments with a right-hand read_csv() call for inst in find_operations(block=block, op_name='call'): expr = inst.value - call = guard(find_callname, func_ir, expr) - if call not in self._pandas_read_csv_calls: + try: + callee = func_ir.infer_constant(expr.func) + except errors.ConstantInferenceError: continue - # collect constant parameters with type list and dict + if callee is not pd.read_csv: + continue + # collect arguments with list, set and dict # in order to replace with tuple for key, var in expr.kws: - if key not in self._read_csv_const_args: - continue - try: - const = func_ir.infer_constant(var) - except errors.ConstantInferenceError: - try: - const = ConstantInference(func_ir).infer_constant(var.name) - except errors.ConstantInferenceError: - continue - if isinstance(const, (list, dict)): - consts.setdefault(inst, {})[key] = const + if key in self._read_csv_const_args: + arg_def = guard(get_definition, func_ir, var) + ops = ['build_list', 'build_set', 'build_map'] + if arg_def.op in ops: + args.append(arg_def) - return len(consts) > 0 + return len(args) > 0 def apply(self): - new_block = self.block.copy() - new_block.clear() - vars_to_remove = [] - - for inst in self.block.body: - if inst in self.consts: - consts = self.consts[inst] - - for key, value in consts.items(): - if key not in dict(inst.value.kws): - continue - - # collecting data from current variable - current_var = [var for name, var in inst.value.kws if name == key][0] - loc = current_var.loc - - seq, _ = guard(find_build_sequence, self.func_ir, current_var) - if not seq: - continue - if isinstance(value, list): - items = seq - elif isinstance(value, dict): - items = sum(map(list, seq), []) - else: - continue - - # create tuple variable - stmt = make_assign(ir.Expr.build_tuple(items=items, loc=loc), new_block.scope, - self.func_ir, loc, name=f"{key}_tuple") - new_block.append(stmt) - - # replace variable in call - inst.value.kws = [(kw[0], stmt.target) if kw[0] == key else kw for kw in inst.value.kws] - - # save old variable for removing - vars_to_remove.append(current_var) - - new_block.append(inst) - - # remove old variables - for var in vars_to_remove: - # unsused variables are removed after new block is created b/c - # remove_unused_recursively should see all del statements of variables - remove_unused_recursively(var, new_block, self.func_ir) - - return new_block + """ + Replace list, set and dict expressions with tuple. + """ + block = self.block + for inst in block.body: + if isinstance(inst, ir.Assign) and inst.value in self.args: + if inst.value.op == 'build_map': + inst.value.items = sum(map(list, inst.value.items), []) + inst.value.op = 'build_tuple' + return block diff --git a/sdc/tests/__init__.py b/sdc/tests/__init__.py index 137af26f8..c36067f0f 100644 --- a/sdc/tests/__init__.py +++ b/sdc/tests/__init__.py @@ -29,6 +29,7 @@ from sdc.tests.test_series import * from sdc.tests.test_dataframe import * from sdc.tests.test_hiframes import * +from .categorical import * # from sdc.tests.test_d4p import * from sdc.tests.test_date import * diff --git a/sdc/tests/categorical/__init__.py b/sdc/tests/categorical/__init__.py new file mode 100644 index 000000000..3bafd7472 --- /dev/null +++ b/sdc/tests/categorical/__init__.py @@ -0,0 +1,29 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from . import test_categorical +from . import test_categoricaldtype +from . import test_series_category diff --git a/sdc/tests/categorical/test_categorical.py b/sdc/tests/categorical/test_categorical.py new file mode 100644 index 000000000..0f55dc949 --- /dev/null +++ b/sdc/tests/categorical/test_categorical.py @@ -0,0 +1,93 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from sdc.tests.test_base import TestCase + +import pandas as pd +import numba as nb +from numba import types + +from sdc.datatypes.categorical.types import ( + Categorical, + CategoricalDtypeType, +) + + +class CategoricalTest(TestCase): + + def _pd_value(self): + return pd.Categorical(values=[1, 2, 3, 2, 1]) + + def test_typeof(self): + pd_value = self._pd_value() + nb_type = nb.typeof(pd_value) + + assert(isinstance(nb_type, Categorical)) + assert(nb_type.pd_dtype == CategoricalDtypeType(categories=[1, 2, 3], ordered=False)) + assert(nb_type.codes == types.Array(dtype=types.int8, ndim=1, layout='C', readonly=True)) + + def test_unboxing(self): + @nb.njit + def func(c): + pass + + pd_value = self._pd_value() + func(pd_value) + + def test_boxing(self): + @nb.njit + def func(c): + return c + + pd_value = self._pd_value() + boxed = func(pd_value) + assert(boxed.equals(pd_value)) + + def test_lowering(self): + pd_value = self._pd_value() + + @nb.njit + def func(): + return pd_value + + boxed = func() + assert(boxed.equals(pd_value)) + + def test_constructor(self): + @nb.njit + def func(): + return pd.Categorical(values=(1, 2, 3, 2, 1)) + + boxed = func() + assert(boxed.equals(self._pd_value())) + + def test_constructor_values_list(self): + @nb.njit + def func(): + return pd.Categorical(values=[1, 2, 3, 2, 1]) + + boxed = func() + assert(boxed.equals(self._pd_value())) diff --git a/sdc/tests/categorical/test_categoricaldtype.py b/sdc/tests/categorical/test_categoricaldtype.py new file mode 100644 index 000000000..4d7030bb9 --- /dev/null +++ b/sdc/tests/categorical/test_categoricaldtype.py @@ -0,0 +1,130 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from sdc.tests.test_base import TestCase + +import pandas as pd +import numba as nb + +from sdc.datatypes.categorical.types import CategoricalDtypeType + + +class CategoricalDtypeTest(TestCase): + + def _pd_dtype(self, ordered=True): + return pd.CategoricalDtype(categories=['b', 'a'], ordered=ordered) + + def test_typeof(self): + pd_dtype = self._pd_dtype() + nb_dtype = nb.typeof(pd_dtype) + + assert(isinstance(nb_dtype, CategoricalDtypeType)) + assert(nb_dtype.categories == list(pd_dtype.categories)) + assert(nb_dtype.ordered == pd_dtype.ordered) + + def test_unboxing(self): + @nb.njit + def func(c): + pass + + pd_dtype = self._pd_dtype() + func(pd_dtype) + + def test_boxing(self): + @nb.njit + def func(c): + return c + + pd_dtype = self._pd_dtype() + boxed = func(pd_dtype) + assert(boxed == pd_dtype) + + def test_lowering(self): + pd_dtype = self._pd_dtype() + + @nb.njit + def func(): + return pd_dtype + + boxed = func() + assert(boxed == pd_dtype) + + def test_constructor(self): + @nb.njit + def func(): + return pd.CategoricalDtype(categories=('b', 'a'), ordered=True) + + boxed = func() + assert(boxed == self._pd_dtype()) + + def test_constructor_categories_list(self): + @nb.njit + def func(): + return pd.CategoricalDtype(categories=['b', 'a'], ordered=True) + + boxed = func() + assert(boxed == self._pd_dtype()) + + def test_constructor_categories_set(self): + @nb.njit + def func(): + return pd.CategoricalDtype(categories={'b', 'a'}, ordered=True) + + boxed = func() + assert(boxed == self._pd_dtype()) + + def test_constructor_no_order(self): + @nb.njit + def func(): + return pd.CategoricalDtype(categories=('b', 'a')) + + boxed = func() + assert(boxed == self._pd_dtype(ordered=False)) + + def test_constructor_no_categories(self): + @nb.njit + def func(): + return pd.CategoricalDtype() + + boxed = func() + expected = pd.CategoricalDtype(ordered=None) + assert(boxed == expected) + assert(boxed.categories == expected.categories) + assert(boxed.ordered == expected.ordered) + + def test_attribute_ordered(self): + @nb.njit + def func(c): + return c.ordered + + pd_dtype = self._pd_dtype() + ordered = func(pd_dtype) + assert(ordered == pd_dtype.ordered) + + +if __name__ == "__main__": + import unittest + unittest.main() diff --git a/sdc/tests/categorical/test_series_category.py b/sdc/tests/categorical/test_series_category.py new file mode 100644 index 000000000..9e6d69643 --- /dev/null +++ b/sdc/tests/categorical/test_series_category.py @@ -0,0 +1,114 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from sdc.tests.test_base import TestCase + +import pandas as pd +import numba as nb +from numba import types + +from sdc.types import ( + SeriesType, + CategoricalDtypeType, + Categorical, +) + + +class SeriesCategoryTest(TestCase): + """ + Test for pandas Series with CategoricalDtype. + """ + + def _pd_value(self): + return pd.Series(data=[1, 2, 3, 2, 1], dtype='category') + + def test_typeof(self): + pd_value = self._pd_value() + nb_type = nb.typeof(pd_value) + + assert(isinstance(nb_type, SeriesType)) + assert(nb_type.dtype == CategoricalDtypeType(categories=[1, 2, 3], ordered=False)) + assert(nb_type.index == types.none) + assert(nb_type.data == Categorical(CategoricalDtypeType(categories=[1, 2, 3], ordered=False))) + + def test_unboxing(self): + @nb.njit + def func(c): + pass + + pd_value = self._pd_value() + func(pd_value) + + def test_boxing(self): + @nb.njit + def func(c): + return c + + pd_value = self._pd_value() + boxed = func(pd_value) + assert(boxed.equals(pd_value)) + + def test_lowering(self): + pd_value = self._pd_value() + + @nb.njit + def func(): + return pd_value + + boxed = func() + assert(boxed.equals(pd_value)) + + def test_constructor(self): + @nb.njit + def func(): + return pd.Series(data=(1, 2, 3, 2, 1), dtype='category') + + boxed = func() + assert(boxed.equals(self._pd_value())) + + def test_constructor_list(self): + @nb.njit + def func(): + return pd.Series(data=[1, 2, 3, 2, 1], dtype='category') + + boxed = func() + assert(boxed.equals(self._pd_value())) + + def test_constructor_CategoricalDtype(self): + @nb.njit + def func(): + return pd.Series(data=(1, 2, 3, 2, 1), dtype=pd.CategoricalDtype(categories=(1, 2, 3))) + + boxed = func() + assert(boxed.equals(self._pd_value())) + + def test_constructor_CategoricalDtype_list(self): + @nb.njit + def func(): + return pd.Series(data=[1, 2, 3, 2, 1], dtype=pd.CategoricalDtype(categories=[1, 2, 3])) + + boxed = func() + assert(boxed.equals(self._pd_value())) diff --git a/sdc/tests/test_io.py b/sdc/tests/test_io.py index d65fe8a0e..6eb1ee58a 100644 --- a/sdc/tests/test_io.py +++ b/sdc/tests/test_io.py @@ -32,7 +32,7 @@ import unittest import numba from numba.core.config import IS_32BITS -from pandas.api.types import CategoricalDtype +from pandas import CategoricalDtype import sdc from sdc.io.csv_ext import pandas_read_csv as pd_read_csv @@ -597,7 +597,6 @@ def test_impl(): return test_impl - @skip_numba_jit def test_csv_str1(self): test_impl = self.pd_csv_str1() hpat_func = self.jit(test_impl) @@ -659,24 +658,18 @@ def pd_csv_cat1(self, use_pyarrow=False): read_csv = self._read_csv(use_pyarrow) def test_impl(): - # names = ['C1', 'C2', 'C3'] + names = ['C1', 'C2', 'C3'] ct_dtype = CategoricalDtype(['A', 'B', 'C']) dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str} - df = read_csv("csv_data_cat1.csv", - # names=names, # Error: names should be constant list - names=['C1', 'C2', 'C3'], - dtype=dtypes - ) - return df.C2 + df = read_csv("csv_data_cat1.csv", names=names, dtype=dtypes) + return df return test_impl - @skip_numba_jit def test_csv_cat1(self): test_impl = self.pd_csv_cat1() hpat_func = self.jit(test_impl) - pd.testing.assert_series_equal( - hpat_func(), test_impl(), check_names=False) + pd.testing.assert_frame_equal(hpat_func(), test_impl(), check_names=False) def pd_csv_cat2(self, use_pyarrow=False): read_csv = self._read_csv(use_pyarrow) @@ -692,7 +685,6 @@ def test_impl(): return test_impl - @skip_numba_jit def test_csv_cat2(self): test_impl = self.pd_csv_cat2() hpat_func = self.jit(test_impl) diff --git a/sdc/types.py b/sdc/types.py new file mode 100644 index 000000000..7683ad3c3 --- /dev/null +++ b/sdc/types.py @@ -0,0 +1,28 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from .datatypes.categorical.types import * +from .datatypes.series.types import *