From d32e9ecf480ef1ae114113df89112f8a460b7423 Mon Sep 17 00:00:00 2001 From: Vyacheslav-Smirnov <51660067+Vyacheslav-Smirnov@users.noreply.github.com> Date: Fri, 5 Feb 2021 12:16:40 +0300 Subject: [PATCH 1/4] Update wheels build + Update linux build tag for wheels (#954) * Update wheels build + Update linux build tag for wheels * Put numba back to build deps --- conda-recipe/bld.bat | 8 ++++++++ conda-recipe/build.sh | 18 +++++++++++++++++- conda-recipe/meta.yaml | 21 +-------------------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat index 1709a670a..fdd3a9cd4 100644 --- a/conda-recipe/bld.bat +++ b/conda-recipe/bld.bat @@ -2,3 +2,11 @@ echo on "%PYTHON%" setup.py build install --single-version-externally-managed --record=record.txt if errorlevel 1 exit 1 + +rem Build wheel package +if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( + %PYTHON% setup.py bdist_wheel + if errorlevel 1 exit 1 + copy dist\sdc*.whl %WHEELS_OUTPUT_FOLDER% + if errorlevel 1 exit 1 +) diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh index 36247441a..d38205712 100644 --- a/conda-recipe/build.sh +++ b/conda-recipe/build.sh @@ -1,4 +1,20 @@ set -ex -MACOSX_DEPLOYMENT_TARGET=10.9 \ +if [ `uname` == Darwin ]; then + WHEELS_BUILD_ARGS="" + export MACOSX_DEPLOYMENT_TARGET=10.9 +else + if [ "$CONDA_PY" == "36" ]; then + WHEELS_BUILD_ARGS="-p manylinux1_x86_64" + else + WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" + fi +fi + $PYTHON setup.py build install --single-version-externally-managed --record=record.txt + +# Build wheel package +if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then + $PYTHON setup.py bdist_wheel ${WHEELS_BUILD_ARGS} + cp dist/sdc*.whl ${WHEELS_OUTPUT_FOLDER} +fi diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 40868e5e2..ed1084d14 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -13,13 +13,12 @@ build: number: {{ GIT_DESCRIBE_NUMBER|int }} script_env: - SDC_CONFIG_PIPELINE_SDC + - 
WHEELS_OUTPUT_FOLDER requirements: build: - {{ compiler('c') }} # [not osx] - {{ compiler('cxx') }} # [not osx] - - wheel - - python - numba {{ NUMBA_VERSION }} host: @@ -45,24 +44,6 @@ test: imports: - sdc -outputs: - - type: conda - name: sdc - - type: wheel - name: sdc - requirements: - build: - - {{ compiler('c') }} # [not osx] - - {{ compiler('cxx') }} # [not osx] - - python - - wheel - - setuptools - - numba {{ NUMBA_VERSION }} - - numpy - - pandas {{ PANDAS_VERSION }} - - pyarrow {{ PYARROW_VERSION }} - - tbb-devel - about: home: https://github.com/IntelPython/sdc license: BSD-2-Clause From 5ce38417dc3c7ba53a65b093d96888c1ec7008ca Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Fri, 19 Feb 2021 17:32:31 +0300 Subject: [PATCH 2/4] Merge from master 2021_w8 (#962) * Adds Int64Index type and updates Series and DF methods to use it (#950) * Adds Int64Index type and updates Series and DF methods to use it Motivation: as part of the work on supporting common pandas indexes a new type (Int64IndexType) representing pandas.Int64Index is added. Boxing/unboxing of Series and DataFrames as well as common numpy-like functions are changed accordingly to handle it. * Fixing DateTime tests and PEP remarks * Fixing review comments #1 * Move to Numba 0.52 (#939) * Taking numba from master * Moving to Numba 0.52 commit 3182540b127268ace11cf4042cd87f044875d9fa Author: Kozlov, Alexey Date: Wed Oct 21 19:49:58 2020 +0300 Cleaning up before squash commit 895668116542fe3057f73fcb276c441cbde66747 Author: Kozlov, Alexey Date: Tue Oct 13 17:31:34 2020 +0300 Workaround for set from str_arr problem * Fixing correct NUMBA_VERSION * Remove intel/label/beta channel from Azure CI builds * Move to pandas=1.2.0 (#959) * Move to pandas=1.2.0 Motivation: use latest versions of dependencies. 
* More failed tests are fixed * Fixing doc build * Fixing bug in stability of mergesort impl for StringArray (#961) Motivation: for StringArray type legacy implementation of stable sort computed result when sorting with ascending=False by reversing the result of argsorting with ascending=True, which produces wrong order in groups of elements with the same value. Implemented solution adds new function argument 'ascending' and uses it when calling native function impl via serial stable_sort. --- README.rst | 8 +- buildscripts/utilities.py | 2 +- conda-recipe/meta.yaml | 4 +- .../_api_ref.pandas.window_templ.rst | 10 +- docs/source/getting_started.rst | 4 +- requirements.txt | 4 +- sdc/__init__.py | 3 +- sdc/_str_ext.cpp | 34 + sdc/datatypes/common_functions.py | 101 +-- .../hpat_pandas_dataframe_functions.py | 3 +- sdc/datatypes/hpat_pandas_series_functions.py | 31 +- sdc/datatypes/int64_index_type.py | 65 ++ sdc/extensions/indexes/indexes_generic.py | 40 ++ sdc/extensions/indexes/int64_index_ext.py | 415 +++++++++++++ sdc/extensions/indexes/range_index_ext.py | 59 +- sdc/functions/numpy_like.py | 56 +- sdc/functions/sort.py | 20 +- sdc/hiframes/api.py | 11 +- sdc/hiframes/boxing.py | 23 +- sdc/hiframes/pd_series_ext.py | 1 - sdc/native/module.cpp | 44 +- sdc/native/sort.cpp | 12 +- sdc/native/stable_sort.cpp | 14 +- sdc/native/utils.cpp | 12 + sdc/native/utils.hpp | 21 + sdc/set_ext.py | 1 + sdc/str_arr_ext.py | 83 ++- sdc/tests/__init__.py | 2 +- sdc/tests/indexes/__init__.py | 29 + sdc/tests/indexes/index_datagens.py | 88 +++ sdc/tests/indexes/test_indexes.py | 266 ++++++++ sdc/tests/indexes/test_int64_index.py | 583 ++++++++++++++++++ .../test_range_index.py} | 258 ++------ sdc/tests/test_dataframe.py | 24 + sdc/tests/test_date.py | 4 +- sdc/tests/test_hpat_jit.py | 1 + sdc/tests/test_rolling.py | 1 - sdc/tests/test_sdc_numpy.py | 39 ++ sdc/tests/test_series.py | 72 ++- sdc/utilities/sdc_typing_utils.py | 18 +- setup.py | 4 +- 41 files changed, 2021 insertions(+), 
449 deletions(-) create mode 100644 sdc/datatypes/int64_index_type.py create mode 100644 sdc/extensions/indexes/indexes_generic.py create mode 100644 sdc/extensions/indexes/int64_index_ext.py create mode 100644 sdc/tests/indexes/__init__.py create mode 100644 sdc/tests/indexes/index_datagens.py create mode 100644 sdc/tests/indexes/test_indexes.py create mode 100644 sdc/tests/indexes/test_int64_index.py rename sdc/tests/{test_indexes.py => indexes/test_range_index.py} (74%) diff --git a/README.rst b/README.rst index de0dd0e90..a5f18b8a9 100644 --- a/README.rst +++ b/README.rst @@ -34,13 +34,13 @@ Distribution includes Intel® SDC for Python 3.6 and Python 3.7 for Windows and Intel® SDC conda package can be installed using the steps below:: - > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.0.5 -c anaconda -c conda-forge + > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc-env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel® SDC wheel package can be installed using the steps below:: - > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.0.5 -c anaconda -c conda-forge + > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc-env > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc @@ -82,7 +82,7 @@ Building on Linux with setuptools export PYVER=<3.6 or 3.7> export NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.49 pandas=1.0.5 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64 + conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel 
tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64 source activate sdc-env git clone https://github.com/IntelPython/sdc.git cd sdc @@ -120,7 +120,7 @@ Building on Windows with setuptools set PYVER=<3.6 or 3.7> set NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.49 pandas=1.0.5 pyarrow=0.17.0 + conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 conda activate sdc-env set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include set LIB=%LIB%;%CONDA_PREFIX%\Library\lib diff --git a/buildscripts/utilities.py b/buildscripts/utilities.py index a1e1c0a90..440c64ec0 100644 --- a/buildscripts/utilities.py +++ b/buildscripts/utilities.py @@ -52,7 +52,7 @@ def __init__(self, python, sdc_local_channel=None): self.line_single = '-'*80 # Set channels - self.channel_list = ['-c', 'intel/label/beta', '-c', 'defaults', '-c', 'conda-forge'] + self.channel_list = ['-c', 'defaults', '-c', 'conda-forge'] if sdc_local_channel: sdc_local_channel = Path(sdc_local_channel).resolve().as_uri() self.channel_list = ['-c', sdc_local_channel] + self.channel_list diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index ed1084d14..bd95dbc9d 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ -{% set NUMBA_VERSION = "==0.51.2" %} -{% set PANDAS_VERSION = "==1.0.5" %} +{% set NUMBA_VERSION = "==0.52.0" %} +{% set PANDAS_VERSION = "==1.2.0" %} {% set PYARROW_VERSION = "==0.17.0" %} package: diff --git a/docs/source/_templates/_api_ref.pandas.window_templ.rst b/docs/source/_templates/_api_ref.pandas.window_templ.rst index fbf6419cd..c4308cb9d 100644 --- a/docs/source/_templates/_api_ref.pandas.window_templ.rst +++ b/docs/source/_templates/_api_ref.pandas.window_templ.rst @@ -51,8 +51,8 @@ Exponentially-weighted moving 
window functions ---------------------------------------------- .. sdc_toctree - EWM.mean - EWM.std - EWM.var - EWM.corr - EWM.cov + ewm.ExponentialMovingWindow.mean + ewm.ExponentialMovingWindow.std + ewm.ExponentialMovingWindow.var + ewm.ExponentialMovingWindow.corr + ewm.ExponentialMovingWindow.cov diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index a8def2dd0..b0fcc0182 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla Intel SDC conda package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=0.25.3 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc_env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel SDC wheel package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=0.25.3 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc_env > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc diff --git a/requirements.txt b/requirements.txt index dbe156342..f3016c49e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy>=1.16 -pandas==0.25.3 +pandas==1.2.0 pyarrow==0.17.0 -numba==0.51.2 +numba==0.52.0 tbb tbb-devel diff --git a/sdc/__init__.py b/sdc/__init__.py index 2a514b70a..e9ca063dd 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -28,7 +28,7 @@ # re-export from Numba from numba import (typeof, prange, pndindex, gdb, gdb_breakpoint, gdb_init, - stencil, threading_layer, 
jitclass, objmode) + stencil, threading_layer, objmode) import sdc.config import sdc.set_ext @@ -48,6 +48,7 @@ import sdc.datatypes.series.init import sdc.extensions.indexes.range_index_ext +import sdc.extensions.indexes.int64_index_ext from ._version import get_versions diff --git a/sdc/_str_ext.cpp b/sdc/_str_ext.cpp index 304d449fb..b5e41ce17 100644 --- a/sdc/_str_ext.cpp +++ b/sdc/_str_ext.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include "_str_decode.cpp" @@ -129,6 +130,7 @@ extern "C" npy_intp array_size(PyArrayObject* arr); void* array_getptr1(PyArrayObject* arr, npy_intp ind); void array_setitem(PyArrayObject* arr, char* p, PyObject* s); + void stable_argsort(char* data_ptr, uint32_t* in_offsets, int64_t len, int8_t ascending, uint64_t* result); PyMODINIT_FUNC PyInit_hstr_ext(void) { @@ -201,6 +203,7 @@ extern "C" PyObject_SetAttrString(m, "array_setitem", PyLong_FromVoidPtr((void*)(&array_setitem))); PyObject_SetAttrString(m, "decode_utf8", PyLong_FromVoidPtr((void*)(&decode_utf8))); PyObject_SetAttrString(m, "get_utf8_size", PyLong_FromVoidPtr((void*)(&get_utf8_size))); + PyObject_SetAttrString(m, "stable_argsort", PyLong_FromVoidPtr((void*)(&stable_argsort))); return m; } @@ -871,4 +874,35 @@ extern "C" return; } + void stable_argsort(char* data_ptr, uint32_t* in_offsets, int64_t len, int8_t ascending, uint64_t* result) + { + using str_index_pair_type = std::pair; + std::vector str_arr_indexed; + str_arr_indexed.reserve(len); + + for (int64_t i=0; i < len; ++i) + { + uint32_t start = in_offsets[i]; + uint32_t size = in_offsets[i + 1] - in_offsets[i]; + str_arr_indexed.emplace_back( + std::move(std::string(&data_ptr[start], size)), + i + ); + } + + std::stable_sort(str_arr_indexed.begin(), + str_arr_indexed.end(), + [=](const str_index_pair_type& left, const str_index_pair_type& right){ + if (ascending) + return left.first < right.first; + else + return left.first > right.first; + } + ); + + for (int64_t i=0; i < len; ++i) + result[i] = 
str_arr_indexed[i].second; + } + + } // extern "C" diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index bffdc5b30..8924a99be 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -48,14 +48,17 @@ from sdc.functions import numpy_like from sdc.str_arr_type import string_array_type, StringArrayType from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType from sdc.str_arr_ext import (num_total_chars, append_string_array_to, str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type, cp_str_list_to_array, create_str_arr_from_list, get_utf8_size, - str_arr_set_na_by_mask) + str_arr_set_na_by_mask, str_arr_stable_argosort) from sdc.utilities.prange_utils import parallel_chunks from sdc.utilities.utils import sdc_overload, sdc_register_jitable -from sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes, - TypeChecker) +from sdc.utilities.sdc_typing_utils import ( + find_common_dtype_from_numpy_dtypes, + TypeChecker) +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types class SDCLimitation(Exception): @@ -71,18 +74,20 @@ def hpat_arrays_append(A, B): def hpat_arrays_append_overload(A, B): """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A""" - A_is_range_index = isinstance(A, RangeIndexType) - B_is_range_index = isinstance(B, RangeIndexType) - if isinstance(A, (types.Array, RangeIndexType)): - if isinstance(B, (types.Array, RangeIndexType)): + use_A_array = isinstance(A, (RangeIndexType, Int64IndexType)) + use_B_array = isinstance(B, (RangeIndexType, Int64IndexType)) + if isinstance(A, (types.Array, RangeIndexType, Int64IndexType)): + if isinstance(B, (types.Array, RangeIndexType, Int64IndexType)): def _append_single_numeric_impl(A, B): - _A = A.values if A_is_range_index == True else A # noqa - _B = B.values if B_is_range_index == True else B # noqa + _A 
= A.values if use_A_array == True else A # noqa + _B = B.values if use_B_array == True else B # noqa return numpy.concatenate((_A, _B,)) return _append_single_numeric_impl - elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, (types.Array, RangeIndexType)): - B_dtype_is_range_index = isinstance(B.dtype, RangeIndexType) + + elif (isinstance(B, (types.UniTuple, types.List)) + and isinstance(B.dtype, (types.Array, RangeIndexType, Int64IndexType))): + B_dtype_is_index = isinstance(B.dtype, (RangeIndexType, Int64IndexType)) numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], []) # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime @@ -92,10 +97,10 @@ def _append_list_numeric_impl(A, B): new_data = numpy.empty(total_length, numba_common_dtype) stop = len(A) - _A = numpy.array(A) if A_is_range_index == True else A # noqa + _A = numpy.array(A) if use_A_array == True else A # noqa new_data[:stop] = _A for arr in B: - _arr = numpy.array(arr) if B_dtype_is_range_index == True else arr # noqa + _arr = arr.values if B_dtype_is_index == True else arr # noqa start = stop stop = start + len(_arr) new_data[start:stop] = _arr @@ -218,12 +223,13 @@ def sdc_join_series_indexes_overload(left, right): """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm""" # check that both operands are of types used for representing Pandas indexes - if not (isinstance(left, (types.Array, StringArrayType, RangeIndexType)) - and isinstance(right, (types.Array, StringArrayType, RangeIndexType))): + if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types) + and not isinstance(left, types.NoneType) + and not isinstance(right, types.NoneType)): return None - convert_left = isinstance(left, RangeIndexType) - convert_right = isinstance(right, RangeIndexType) + convert_left = isinstance(left, (RangeIndexType, Int64IndexType)) + convert_right = 
isinstance(right, (RangeIndexType, Int64IndexType)) def _convert_to_arrays_impl(left, right): _left = left.values if convert_left == True else left # noqa @@ -243,10 +249,9 @@ def sdc_join_range_indexes_impl(left, right): return sdc_join_range_indexes_impl - elif isinstance(left, RangeIndexType) and isinstance(right, types.Array): - return _convert_to_arrays_impl - - elif isinstance(left, types.Array) and isinstance(right, RangeIndexType): + elif (isinstance(left, (RangeIndexType, Int64IndexType, types.Array)) + and isinstance(right, (RangeIndexType, Int64IndexType, types.Array)) + and not (isinstance(left, types.Array) and isinstance(right, types.Array))): return _convert_to_arrays_impl # TODO: remove code duplication below and merge numeric and StringArray impls into one @@ -513,7 +518,7 @@ def sdc_arrays_argsort(A, kind='quicksort'): @sdc_overload(sdc_arrays_argsort, jit_options={'parallel': False}) -def sdc_arrays_argsort_overload(A, kind='quicksort'): +def sdc_arrays_argsort_overload(A, kind='quicksort', ascending=True): """Function providing pandas argsort implementation for different 1D array types""" # kind is not known at compile time, so get this function here and use in impl if needed @@ -521,33 +526,31 @@ def sdc_arrays_argsort_overload(A, kind='quicksort'): kind_is_default = isinstance(kind, str) if isinstance(A, types.Array): - def _sdc_arrays_argsort_array_impl(A, kind='quicksort'): + def _sdc_arrays_argsort_array_impl(A, kind='quicksort', ascending=True): _kind = 'quicksort' if kind_is_default == True else kind # noqa - return numpy_like.argsort(A, kind=_kind) + return numpy_like.argsort(A, kind=_kind, ascending=ascending) return _sdc_arrays_argsort_array_impl elif A == string_array_type: - def _sdc_arrays_argsort_str_arr_impl(A, kind='quicksort'): + def _sdc_arrays_argsort_str_arr_impl(A, kind='quicksort', ascending=True): - nan_mask = sdc.hiframes.api.get_nan_mask(A) - idx = numpy.arange(len(A)) - old_nan_positions = idx[nan_mask] - - data = 
A[~nan_mask] - keys = idx[~nan_mask] if kind == 'quicksort': - zipped = list(zip(list(data), list(keys))) - zipped = quicksort_func(zipped) - argsorted = [zipped[i][1] for i in numpy.arange(len(data))] + indexes = numpy.arange(len(A)) + data_index_pairs = list(zip(list(A), list(indexes))) + zipped = quicksort_func(data_index_pairs) + argsorted = [zipped[i][1] for i in indexes] + res = numpy.array(argsorted, dtype=numpy.int64) + # for non-stable sort the order within groups does not matter + # so just reverse the result when sorting in descending order + if not ascending: + res = res[::-1] elif kind == 'mergesort': - sdc.hiframes.sort.local_sort((data, ), (keys, )) - argsorted = list(keys) + res = str_arr_stable_argosort(A, ascending=ascending) else: raise ValueError("Unrecognized kind of sort in sdc_arrays_argsort") - argsorted.extend(old_nan_positions) - return numpy.asarray(argsorted, dtype=numpy.int32) + return res return _sdc_arrays_argsort_str_arr_impl @@ -618,13 +621,16 @@ def _sdc_take(data, indexes): @sdc_overload(_sdc_take) def _sdc_take_overload(data, indexes): - if not isinstance(data, (types.Array, StringArrayType, RangeIndexType)): + valid_data_types = (types.Array,) + sdc_pandas_index_types + if not (isinstance(data, valid_data_types) and not isinstance(data, types.NoneType)): return None - if not (isinstance(indexes, (types.Array, types.List)) + + if not (isinstance(indexes, (types.Array, types.List, Int64IndexType)) and isinstance(indexes.dtype, (types.Integer, types.ListType))): return None - if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List, RangeIndexType)): + if (isinstance(indexes.dtype, types.ListType) + and isinstance(data, (types.Array, types.List, RangeIndexType, Int64IndexType))): arr_dtype = data.dtype def _sdc_take_list_impl(data, indexes): @@ -677,7 +683,7 @@ def _sdc_take_list_str_impl(data, indexes): return _sdc_take_list_str_impl - elif isinstance(data, (types.Array, RangeIndexType)): + elif 
isinstance(data, (types.Array, RangeIndexType, Int64IndexType)): arr_dtype = data.dtype def _sdc_take_array_impl(data, indexes): @@ -740,6 +746,7 @@ def sdc_reindex_series_overload(arr, index, name, by_index): """ Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """ range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType) + int64_indexes = isinstance(index, Int64IndexType) and isinstance(by_index, Int64IndexType) data_dtype, index_dtype = arr.dtype, index.dtype data_is_str_arr = isinstance(arr.dtype, types.UnicodeType) @@ -748,6 +755,8 @@ def sdc_reindex_series_impl(arr, index, name, by_index): # no reindexing is needed if indexes are equal if range_indexes == True: # noqa equal_indexes = numpy_like.array_equal(index, by_index) + elif int64_indexes == True: # noqa + equal_indexes = numpy_like.array_equal(index, by_index) else: equal_indexes = False if (index is by_index or equal_indexes): @@ -772,10 +781,10 @@ def sdc_reindex_series_impl(arr, index, name, by_index): map_index_to_position[value] = i index_mismatch = 0 - # FIXME: TypingError in parfor step (wrong promotion to float64?) 
if prange is used - for i in numpy.arange(len(by_index)): - if by_index[i] in map_index_to_position: - pos_in_self = map_index_to_position[by_index[i]] + for i in numba.prange(len(by_index)): + val = by_index[i] + if val in map_index_to_position: + pos_in_self = map_index_to_position[val] _res_data[i] = arr[pos_in_self] if data_is_str_arr == True: # noqa res_data_nan_mask[i] = isna(arr, i) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 31f3738d9..de7edef66 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -50,6 +50,7 @@ gen_impl_generator, find_common_dtype_from_numpy_dtypes) from sdc.str_arr_ext import StringArrayType from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType from sdc.hiframes.pd_dataframe_type import DataFrameType from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps @@ -2257,7 +2258,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx): if accessor == 'at': num_idx = (isinstance(idx[0], types.Number) - and isinstance(self.dataframe.index, (types.Array, types.NoneType, RangeIndexType))) + and isinstance(self.dataframe.index, (types.NoneType, RangeIndexType, Int64IndexType))) str_idx = (isinstance(idx[0], (types.UnicodeType, types.StringLiteral)) and isinstance(self.dataframe.index, StringArrayType)) if isinstance(idx, types.Tuple) and isinstance(idx[1], types.StringLiteral): diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 610a21fc7..1c18ba2e6 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -53,6 +53,7 @@ find_common_dtype_from_numpy_dtypes, has_literal_value, has_python_value) from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import 
Int64IndexType from sdc.datatypes.common_functions import (sdc_join_series_indexes, sdc_arrays_argsort, sdc_reindex_series) from sdc.datatypes.hpat_pandas_rolling_types import ( gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl) @@ -71,6 +72,7 @@ from sdc.hiframes.api import isna from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby from sdc.utilities.prange_utils import parallel_chunks +from sdc.set_ext import build_set from .pandas_series_functions import apply from .pandas_series_functions import map as _map @@ -618,7 +620,7 @@ def sdc_pandas_series_setitem(self, idx, value): def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): if assign_via_idx_mask == True: # noqa - # FIXME_Numba#5157: using asarray since eq impl for RangeIndexType returns list + # FIXME_Numba#5157: using asarray since eq impl for index types returns list _idx = numpy.asarray(self._index == idx) elif assign_via_idx_data == True: # noqa _idx = idx._data @@ -651,7 +653,7 @@ def sdc_pandas_series_setitem_idx_bool_array_align_impl(self, idx, value): # and filtered indexes are looked in value.index, and if found corresponding value is set if value_is_series == True: # noqa value_index, self_index = value.index, self.index - unique_value_indices, unique_self_indices = set(value_index), set(self_index) + unique_value_indices, unique_self_indices = build_set(value_index), build_set(self_index) # pandas behaves differently if value.index has duplicates and if it has no # in case of duplicates in value.index assignment is made via positions @@ -701,7 +703,7 @@ def sdc_pandas_series_setitem_idx_bool_series_align_impl(self, idx, value): # and filtered indexes are either looked in value.index (if value is a Series) # or in self.index (if value is scalar or array) filtered_idx_indices = idx_index[idx._data] - filtered_idx_indices_set = set(filtered_idx_indices) + filtered_idx_indices_set = build_set(filtered_idx_indices) if value_is_series == True: 
# noqa if len(filtered_idx_indices_set) != len(filtered_idx_indices): @@ -774,7 +776,7 @@ def sdc_pandas_series_setitem_idx_int_series_align_impl(self, idx, value): raise ValueError("Reindexing only valid with uniquely valued Index objects") if len(valid_indices_masked) != idx_size: - raise ValueError("Reindexing not possible: idx has index not found in Series") + raise KeyError("Reindexing not possible: idx has index not found in Series") if value_is_scalar == True: # noqa self._data[valid_indices_positions] = _value @@ -808,7 +810,7 @@ def sdc_pandas_series_setitem_idx_str_series_align_impl(self, idx, value): set_positions[i] = map_index_to_position[index_value] if number_of_found != idx_data_size: - raise ValueError("Reindexing not possible: idx has index not found in Series") + raise KeyError("Reindexing not possible: idx has index not found in Series") if value_is_series == True: # noqa self._data[set_positions] = value._data @@ -2073,7 +2075,7 @@ def hpat_pandas_series_isin_impl(self, values): # return pandas.Series (np.isin (self._data, values)) values = str_list_to_array(list(values)) - values = set(values) + values = build_set(values) data_len = len(self._data) result = numpy.empty(data_len, dtype=numpy.bool_) for i in prange(data_len): @@ -2085,7 +2087,7 @@ def hpat_pandas_series_isin_impl(self, values): # TODO: replace with below line when Numba supports np.isin in nopython mode # return pandas.Series (np.isin (self._data, values)) - values = set(values) + values = build_set(values) data_len = len(self._data) result = numpy.empty(data_len, dtype=numpy.bool_) for i in prange(data_len): @@ -3446,7 +3448,7 @@ def hpat_pandas_series_unique_str_impl(self): Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_unique_str ''' - str_set = set(self._data) + str_set = build_set(self._data) return to_array(str_set) return hpat_pandas_series_unique_str_impl @@ -3578,7 +3580,7 @@ def hpat_pandas_series_nunique_str_impl(self, dropna=True): if dropna: 
nan_mask = self.isna() data = self._data[~nan_mask._data] - unique_values = set(data) + unique_values = build_set(data) return len(unique_values) return hpat_pandas_series_nunique_str_impl @@ -3591,7 +3593,7 @@ def hpat_pandas_series_nunique_impl(self, dropna=True): data_mask_for_nan = numpy.isnan(self._data) nan_exists = numpy.any(data_mask_for_nan) data_no_nan = self._data[~data_mask_for_nan] - data_set = set(data_no_nan) + data_set = build_set(data_no_nan) if dropna or not nan_exists: return len(data_set) else: @@ -3948,11 +3950,9 @@ def _sdc_pandas_series_sort_values_impl( good = ~data_nan_mask if kind_is_none_or_default == True: # noqa - argsort_res = sdc_arrays_argsort(self._data[good], kind='quicksort') + argsort_res = sdc_arrays_argsort(self._data[good], kind='quicksort', ascending=ascending) else: - argsort_res = sdc_arrays_argsort(self._data[good], kind=kind) - if not ascending: - argsort_res = argsort_res[::-1] + argsort_res = sdc_arrays_argsort(self._data[good], kind=kind, ascending=ascending) idx = numpy.arange(len(self), dtype=numpy.int32) sorted_index = numpy.empty(len(self), dtype=numpy.int32) @@ -4034,7 +4034,8 @@ def hpat_pandas_series_dropna(self, axis=0, inplace=False): ty_checker.raise_exc(inplace, 'bool', 'inplace') if (isinstance(self.data.dtype, types.Number) - and isinstance(self.index, (types.Number, types.NoneType, RangeIndexType))): + and (isinstance(self.index, types.NoneType) + or isinstance(self.index.dtype, types.Number))): def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False): index = self.index return numpy_like.dropna(self._data, index, self._name) diff --git a/sdc/datatypes/int64_index_type.py b/sdc/datatypes/int64_index_type.py new file mode 100644 index 000000000..745d394a7 --- /dev/null +++ b/sdc/datatypes/int64_index_type.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper +) + + +class Int64IndexType(types.IterableType): + dtype = types.int64 + + def __init__(self, data, is_named=False): + self.data = data + self.is_named = is_named + super(Int64IndexType, self).__init__( + name='Int64IndexType({}, {})'.format(data, is_named)) + + @property + def iterator_type(self): + res = self.data.iterator_type + return res + + +@register_model(Int64IndexType) +class Int64IndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + data_type = fe_type.data + name_type = types.unicode_type if fe_type.is_named else types.none + members = [ + ('data', data_type), + ('name', name_type), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(Int64IndexType, 'data', '_data') +make_attribute_wrapper(Int64IndexType, 'name', '_name') diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py new file mode 100644 index 000000000..397698565 --- /dev/null +++ b/sdc/extensions/indexes/indexes_generic.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd + +from numba import types + + +def _check_dtype_param_type(dtype): + """ Returns True if dtype is a valid type for dtype parameter and False otherwise. + Used in RangeIndex and Int64Index ctors and other methods that take dtype parameter. """ + + valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) + return isinstance(dtype, valid_dtype_types) or dtype is None diff --git a/sdc/extensions/indexes/int64_index_ext.py b/sdc/extensions/indexes/int64_index_ext.py new file mode 100644 index 000000000..97db3fd4b --- /dev/null +++ b/sdc/extensions/indexes/int64_index_ext.py @@ -0,0 +1,415 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numba +import numpy as np +import operator +import pandas as pd + +from numba import types, prange +from numba.core import cgutils +from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, ) +from numba.core.errors import TypingError +from numba.core.typing.templates import signature +from numba.core.imputils import impl_ret_untracked, call_getiter + +from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method +from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array, check_signed_integer +from sdc.functions import numpy_like +from numba.core.boxing import box_array, unbox_array +from sdc.hiframes.api import fix_df_index +from sdc.extensions.indexes.indexes_generic import _check_dtype_param_type + + +@intrinsic +def init_int64_index(typingctx, data, name=None): + + if not (isinstance(data, types.Array) and data.dtype is types.int64): + return None + assert data.ndim == 1, "Index data must be 1-dimensional" + + name = types.none if name is None else name + is_named = False if name is types.none else True + + def codegen(context, builder, sig, args): + data_val, name_val = args + # create series struct and store values + int64_index = cgutils.create_struct_proxy( + sig.return_type)(context, builder) + + int64_index.data = data_val + + if is_named: + if isinstance(name, types.StringLiteral): + int64_index.name = numba.cpython.unicode.make_string_from_constant( + context, builder, types.unicode_type, name.literal_value) + else: + int64_index.name = name_val + + if context.enable_nrt: + context.nrt.incref(builder, sig.args[0], data_val) + if is_named: + context.nrt.incref(builder, sig.args[1], name_val) + + return int64_index._getvalue() + + ret_typ = Int64IndexType(data, 
is_named) + sig = signature(ret_typ, data, name) + return sig, codegen + + +@sdc_overload(pd.Int64Index) +def pd_int64_index_overload(data, dtype=None, copy=False, name=None): + + _func_name = 'pd.Int64Index().' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(data, (types.Array, types.List)) and isinstance(data.dtype, types.Integer) + or isinstance(data, (RangeIndexType, Int64IndexType))): + ty_checker.raise_exc(data, 'array/list of integers or integer index', 'data') + + dtype_is_number_class = isinstance(dtype, types.NumberClass) + dtype_is_numpy_signed_int = (check_signed_integer(dtype) + or dtype_is_number_class and check_signed_integer(dtype.dtype)) + dtype_is_unicode_str = isinstance(dtype, (types.UnicodeType, types.StringLiteral)) + if not _check_dtype_param_type(dtype): + ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') + + if not (isinstance(copy, (types.NoneType, types.Omitted, types.Boolean)) or copy is False): + ty_checker.raise_exc(copy, 'bool', 'copy') + + if not (isinstance(name, (types.NoneType, types.Omitted, types.StringLiteral, types.UnicodeType)) or name is None): + ty_checker.raise_exc(name, 'string or none', 'name') + + is_data_array = isinstance(data, types.Array) + is_data_index = isinstance(data, (RangeIndexType, Int64IndexType)) + data_dtype_is_int64 = data.dtype is types.int64 + + def pd_int64_index_ctor_impl(data, dtype=None, copy=False, name=None): + + if not (dtype is None + or dtype_is_numpy_signed_int + or dtype_is_unicode_str and dtype in ('int8', 'int16', 'int32', 'int64')): + raise ValueError("Incorrect `dtype` passed: expected signed integer") + + if is_data_array == True: # noqa + _data = data + elif is_data_index == True: # noqa + _data = data.values + else: + _data = fix_df_index(data)._data + + if data_dtype_is_int64 == False: # noqa + _data = numpy_like.astype(_data, dtype=types.int64) + else: + if copy: + _data = np.copy(_data) + return init_int64_index(_data, name) + + return pd_int64_index_ctor_impl + + 
+@typeof_impl.register(pd.Int64Index) +def typeof_int64_index(val, c): + index_data_ty = numba.typeof(val._data) + is_named = val.name is not None + return Int64IndexType(index_data_ty, is_named=is_named) + + +@box(Int64IndexType) +def box_int64_index(typ, val, c): + + mod_name = c.context.insert_const_string(c.builder.module, "pandas") + pd_class_obj = c.pyapi.import_module_noblock(mod_name) + + int64_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val) + data = box_array(typ.data, int64_index.data, c) + + # dtype and copy params are not stored so use default values + dtype = c.pyapi.make_none() + copy = c.pyapi.bool_from_bool( + c.context.get_constant(types.bool_, False) + ) + + if typ.is_named: + name = c.pyapi.from_native_value(types.unicode_type, int64_index.name) + else: + name = c.pyapi.make_none() + + res = c.pyapi.call_method(pd_class_obj, "Int64Index", (data, dtype, copy, name)) + + c.pyapi.decref(data) + c.pyapi.decref(dtype) + c.pyapi.decref(copy) + c.pyapi.decref(name) + c.pyapi.decref(pd_class_obj) + return res + + +@unbox(Int64IndexType) +def unbox_int64_index(typ, val, c): + + # TODO: support index unboxing with reference to parent in Numba? 
+ int64_index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + index_data = c.pyapi.object_getattr_string(val, "_data") + int64_index.data = unbox_array(typ.data, index_data, c).value + c.pyapi.decref(index_data) + + if typ.is_named: + name_obj = c.pyapi.object_getattr_string(val, "name") + int64_index.name = numba.cpython.unicode.unbox_unicode_str( + types.unicode_type, name_obj, c).value + c.pyapi.decref(name_obj) + + is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) + return NativeValue(int64_index._getvalue(), is_error=is_error) + + +@sdc_overload_attribute(Int64IndexType, 'name') +def pd_int64_index_name_overload(self): + if not isinstance(self, Int64IndexType): + return None + + is_named_index = self.is_named + + def pd_int64_index_name_impl(self): + if is_named_index == True: # noqa + return self._name + else: + return None + + return pd_int64_index_name_impl + + +@sdc_overload_attribute(Int64IndexType, 'dtype') +def pd_int64_index_dtype_overload(self): + if not isinstance(self, Int64IndexType): + return None + + range_index_dtype = self.dtype + + def pd_int64_index_dtype_impl(self): + return range_index_dtype + + return pd_int64_index_dtype_impl + + +@sdc_overload_attribute(Int64IndexType, 'values') +def pd_int64_index_values_overload(self): + if not isinstance(self, Int64IndexType): + return None + + def pd_int64_index_values_impl(self): + return self._data + + return pd_int64_index_values_impl + + +@sdc_overload(len) +def pd_int64_index_len_overload(self): + if not isinstance(self, Int64IndexType): + return None + + def pd_int64_index_len_impl(self): + return len(self._data) + + return pd_int64_index_len_impl + + +@sdc_overload(operator.contains) +def pd_int64_index_contains_overload(self, val): + if not isinstance(self, Int64IndexType): + return None + + _func_name = 'Operator contains().' 
+ ty_checker = TypeChecker(_func_name) + + if not (isinstance(val, types.Integer)): + ty_checker.raise_exc(val, 'integer scalar', 'val') + + def pd_int64_index_contains_impl(self, val): + # TO-DO: add operator.contains support for arrays in Numba + found = 0 + for i in prange(len(self._data)): + if val == self._data[i]: + found += 1 + + return found > 0 + + return pd_int64_index_contains_impl + + +@sdc_overload_method(Int64IndexType, 'copy') +def pd_int64_index_copy_overload(self, name=None, deep=False, dtype=None): + if not isinstance(self, Int64IndexType): + return None + + _func_name = 'Method copy().' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None): + ty_checker.raise_exc(name, 'string or none', 'name') + + if not (isinstance(deep, (types.Omitted, types.Boolean)) or deep is False): + ty_checker.raise_exc(deep, 'boolean', 'deep') + + if not _check_dtype_param_type(dtype): + ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') + + name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None + keep_name = name_is_none and self.is_named + + def pd_int64_index_copy_impl(self, name=None, deep=False, dtype=None): + + _name = self._name if keep_name == True else name # noqa + new_index_data = self._data if not deep else numpy_like.copy(self._data) + return init_int64_index(new_index_data, _name) + + return pd_int64_index_copy_impl + + +@sdc_overload(operator.getitem) +def pd_int64_index_getitem_overload(self, idx): + if not isinstance(self, Int64IndexType): + return None + + _func_name = 'Operator getitem().' 
+ ty_checker = TypeChecker(_func_name) + + if not (isinstance(idx, (types.Integer, types.SliceType)) + or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))): + ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx') + + if isinstance(idx, types.Integer): + def pd_int64_index_getitem_impl(self, idx): + index_len = len(self._data) + # FIXME_Numba#5801: Numba type unification rules make this float + idx = types.int64((index_len + idx) if idx < 0 else idx) + if (idx < 0 or idx >= index_len): + raise IndexError("Int64Index.getitem: index is out of bounds") + + return self._data[idx] + + return pd_int64_index_getitem_impl + + else: + def pd_int64_index_getitem_impl(self, idx): + index_data = self._data[idx] + return pd.Int64Index(index_data, name=self._name) + + return pd_int64_index_getitem_impl + + +# TO-DO: this and many other impls are generic and should be moved to indexes_generic.py +@sdc_overload(operator.eq) +def pd_int64_index_eq_overload(self, other): + + self_is_index = isinstance(self, Int64IndexType) + other_is_index = isinstance(other, Int64IndexType) + + if not (self_is_index and other_is_index + or (self_is_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) + or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_index)): + return None + one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) + + def pd_int64_index_eq_impl(self, other): + + if one_operand_is_scalar == False: # noqa + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + # names do not matter when comparing pd.Int64Index + left = self.values if self_is_index == True else self # noqa + right = other.values if other_is_index == True else other # noqa + return list(left == right) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed + + return pd_int64_index_eq_impl + + 
+@sdc_overload(operator.ne) +def pd_int64_index_ne_overload(self, other): + + self_is_index = isinstance(self, Int64IndexType) + other_is_index = isinstance(other, Int64IndexType) + + if not (self_is_index and other_is_index + or (self_is_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) + or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_index)): + return None + + def pd_int64_index_ne_impl(self, other): + + eq_res = np.asarray(self == other) # FIXME_Numba#5157: remove np.asarray and return as list + return list(~eq_res) + + return pd_int64_index_ne_impl + + +@lower_builtin(operator.is_, Int64IndexType, Int64IndexType) +def pd_int64_index_is_overload(context, builder, sig, args): + + ty_lhs, ty_rhs = sig.args + if ty_lhs != ty_rhs: + return cgutils.false_bit + + lhs, rhs = args + lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t) + rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t) + return builder.icmp_signed('==', lhs_ptr, rhs_ptr) + + +@lower_builtin('getiter', Int64IndexType) +def pd_int64_index_getiter(context, builder, sig, args): + """ Returns a new iterator object for Int64IndexType by delegating to array __iter__ """ + (value,) = args + int64_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value) + res = call_getiter(context, builder, sig.args[0].data, int64_index.data) + return impl_ret_untracked(context, builder, Int64IndexType, res) + + +@sdc_overload_method(Int64IndexType, 'ravel') +def pd_int64_index_ravel_overload(self, order='C'): + if not isinstance(self, Int64IndexType): + return None + + _func_name = 'Method ravel().' + + # np.ravel argument order is not supported in Numba + if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): + raise TypingError('{} Unsupported parameters. 
Given order: {}'.format(_func_name, order)) + + def pd_int64_index_ravel_impl(self, order='C'): + # np.ravel argument order is not supported in Numba + if order != 'C': + raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") + + return self.values + + return pd_int64_index_ravel_impl diff --git a/sdc/extensions/indexes/range_index_ext.py b/sdc/extensions/indexes/range_index_ext.py index 7b24e7528..cc04cfcba 100644 --- a/sdc/extensions/indexes/range_index_ext.py +++ b/sdc/extensions/indexes/range_index_ext.py @@ -33,23 +33,18 @@ from numba import types from numba.core import cgutils from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, ) - +from numba.core.errors import TypingError from numba.core.typing.templates import signature from numba.core.imputils import impl_ret_untracked, call_getiter from sdc.datatypes.range_index_type import RangeIndexType, RangeIndexDataType from sdc.datatypes.common_functions import SDCLimitation, _sdc_take from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method -from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array +from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array, check_signed_integer from sdc.functions.numpy_like import getitem_by_mask - - -def _check_dtype_param_type(dtype): - """ Returns True is dtype is a valid type for dtype parameter and False otherwise. - Used in RangeIndex ctor and other methods that take dtype parameter. 
""" - - valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) - return isinstance(dtype, valid_dtype_types) or dtype is None +from sdc.functions.numpy_like import astype as nplike_astype +from numba.core.boxing import box_array, unbox_array +from sdc.extensions.indexes.indexes_generic import _check_dtype_param_type @intrinsic @@ -96,8 +91,9 @@ def pd_range_index_overload(start=None, stop=None, step=None, dtype=None, copy=F if not (isinstance(copy, types.Omitted) or fastpath is None): raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'fastpath': {fastpath}") - dtype_is_np_int64 = dtype is types.NumberClass(types.int64) - dtype_is_np_int32 = dtype is types.NumberClass(types.int32) + dtype_is_number_class = isinstance(dtype, types.NumberClass) + dtype_is_numpy_signed_int = (check_signed_integer(dtype) + or dtype_is_number_class and check_signed_integer(dtype.dtype)) dtype_is_unicode_str = isinstance(dtype, (types.UnicodeType, types.StringLiteral)) if not _check_dtype_param_type(dtype): ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') @@ -125,10 +121,8 @@ def pd_range_index_ctor_dummy_impl( def pd_range_index_ctor_impl(start=None, stop=None, step=None, dtype=None, copy=False, name=None, fastpath=None): if not (dtype is None - or dtype_is_unicode_str and dtype == 'int64' - or dtype_is_unicode_str and dtype == 'int32' - or dtype_is_np_int64 - or dtype_is_np_int32): + or dtype_is_numpy_signed_int + or dtype_is_unicode_str and dtype in ('int8', 'int16', 'int32', 'int64')): raise ValueError("Incorrect `dtype` passed: expected signed integer") # TODO: add support of int32 type @@ -356,7 +350,8 @@ def pd_range_index_getitem_overload(self, idx): if isinstance(idx, types.Integer): def pd_range_index_getitem_impl(self, idx): range_len = len(self._data) - idx = (range_len + idx) if idx < 0 else idx + # FIXME_Numba#5801: Numba type unification rules make this float + idx = types.int64((range_len + idx) if idx < 0 else idx) if 
(idx < 0 or idx >= range_len): raise IndexError("RangeIndex.getitem: index is out of bounds") return self.start + self.step * idx @@ -375,12 +370,12 @@ def pd_range_index_getitem_impl(self, idx): return pd_range_index_getitem_impl - # returns np.array which is used to represent pandas Int64Index now if isinstance(idx, (types.Array, types.List)): if isinstance(idx.dtype, types.Integer): def pd_range_index_getitem_impl(self, idx): - return _sdc_take(self, idx) + res_as_arr = _sdc_take(self, idx) + return pd.Int64Index(res_as_arr, name=self._name) return pd_range_index_getitem_impl elif isinstance(idx.dtype, types.Boolean): @@ -398,7 +393,7 @@ def pd_range_index_eq_overload(self, other): if not (self_is_range_index and other_is_range_index or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number) and other_is_range_index))): + or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_range_index)): return None one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) @@ -424,7 +419,7 @@ def pd_range_index_ne_overload(self, other): if not (self_is_range_index and other_is_range_index or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number) and other_is_range_index))): + or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_range_index)): return None def pd_range_index_ne_impl(self, other): @@ -453,5 +448,25 @@ def pd_range_index_getiter(context, builder, sig, args): """ Returns a new iterator object for RangeIndexType by delegating to range.__iter__ """ (value,) = args range_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value) - res = call_getiter(context, builder, types.range_state64_type, range_index.data) + res = call_getiter(context, builder, 
RangeIndexDataType, range_index.data) return impl_ret_untracked(context, builder, RangeIndexType, res) + + +@sdc_overload_method(RangeIndexType, 'ravel') +def pd_range_index_ravel_overload(self, order='C'): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method ravel().' + + if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): + raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order)) + + def pd_range_index_ravel_impl(self, order='C'): + # np.ravel argument order is not supported in Numba + if order != 'C': + raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") + + return self.values + + return pd_range_index_ravel_impl diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index 1e67e8ccd..aa9e00a70 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -47,6 +47,7 @@ from sdc.functions.statistics import skew_formula from sdc.hiframes.api import isna from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType from sdc.utilities.sdc_typing_utils import TypeChecker, is_default from sdc.utilities.utils import (sdc_overload, sdc_register_jitable, min_dtype_int_val, max_dtype_int_val, min_dtype_float_val, @@ -57,6 +58,8 @@ from sdc.utilities.prange_utils import parallel_chunks from sdc.utilities.sdc_typing_utils import check_types_comparable from sdc.functions.sort import parallel_sort, parallel_stable_sort, parallel_argsort, parallel_stable_argsort +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types + def astype(self, dtype): pass @@ -120,7 +123,9 @@ def sdc_astype_overload(self, dtype): """ ty_checker = TypeChecker("numpy-like 'astype'") - if not isinstance(self, (types.Array, StringArrayType, RangeIndexType)): + valid_self_types = (types.Array,) + sdc_pandas_index_types + if not (isinstance(self, 
valid_self_types) + and not isinstance(self, types.NoneType)): return None accepted_dtype_types = (types.functions.NumberClass, types.Function, types.StringLiteral) @@ -156,7 +161,7 @@ def sdc_astype_number_to_string_impl(self, dtype): return sdc_astype_number_to_string_impl - if (isinstance(self, (types.Array, RangeIndexType)) + if (isinstance(self, (types.Array, RangeIndexType, Int64IndexType)) and isinstance(dtype, (types.StringLiteral, types.functions.NumberClass))): def sdc_astype_number_impl(self, dtype): arr = numpy.empty(len(self), dtype=numpy.dtype(dtype)) @@ -344,7 +349,9 @@ def sdc_copy_overload(self): Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k copy """ - if not isinstance(self, (types.Array, StringArrayType, RangeIndexType)): + valid_self_types = (types.Array,) + sdc_pandas_index_types + if not (isinstance(self, valid_self_types) + and not isinstance(self, types.NoneType)): return None if isinstance(self, types.Array): @@ -360,7 +367,7 @@ def sdc_copy_array_impl(self): return sdc_copy_array_impl - if isinstance(self, (StringArrayType, RangeIndexType)): + if isinstance(self, (StringArrayType, RangeIndexType, Int64IndexType)): def sdc_copy_str_arr_impl(self): return self.copy() @@ -953,7 +960,7 @@ def getitem_by_mask(arr, idx): @sdc_overload(getitem_by_mask) -def getitem_by_mask_overload(arr, idx): +def getitem_by_mask_overload(self, idx): """ Creates a new array from arr by selecting elements indicated by Boolean mask idx. 
@@ -971,13 +978,17 @@ def getitem_by_mask_overload(arr, idx): """ - if not isinstance(arr, (types.Array, StringArrayType, RangeIndexType)): - return + valid_self_types = (types.Array,) + sdc_pandas_index_types + if not (isinstance(self, valid_self_types) + and not isinstance(self, types.NoneType)): + return None - res_dtype = arr.dtype - is_str_arr = arr == string_array_type - def getitem_by_mask_impl(arr, idx): - chunks = parallel_chunks(len(arr)) + res_dtype = self.dtype + is_str_arr = self == string_array_type + is_numeric_index = isinstance(self, (RangeIndexType, Int64IndexType)) + + def getitem_by_mask_impl(self, idx): + chunks = parallel_chunks(len(self)) arr_len = numpy.empty(len(chunks), dtype=numpy.int64) length = 0 @@ -1002,16 +1013,18 @@ def getitem_by_mask_impl(arr, idx): for j in range(chunk.start, chunk.stop): if idx[j]: - value = arr[j] + value = self[j] result_data[current_pos] = value if is_str_arr == True: # noqa - result_nan_mask[current_pos] = isna(arr, j) + result_nan_mask[current_pos] = isna(self, j) current_pos += 1 if is_str_arr == True: # noqa result_data_as_str_arr = create_str_arr_from_list(result_data) str_arr_set_na_by_mask(result_data_as_str_arr, result_nan_mask) return result_data_as_str_arr + elif is_numeric_index == True: # noqa + return pandas.Int64Index(result_data, name=self._name) else: return result_data @@ -1088,8 +1101,8 @@ def array_equal(A, B): def sdc_array_equal_overload(A, B): """ Checks 1D sequences A and B of comparable dtypes are equal """ - if not (isinstance(A, (types.Array, StringArrayType, types.NoneType, RangeIndexType)) - or isinstance(B, (types.Array, StringArrayType, types.NoneType, RangeIndexType))): + valid_arg_types = (types.Array,) + sdc_pandas_index_types + if not (isinstance(A, valid_arg_types) or isinstance(B, valid_arg_types)): return None _func_name = "numpy-like 'array_equal'" @@ -1142,6 +1155,9 @@ def sdc_np_array_overload(A): if isinstance(A, RangeIndexType): return lambda A: np.arange(A.start, 
A.stop, A.step) + if isinstance(A, Int64IndexType): + return lambda A: A._data + if isinstance(A, types.containers.Set): # TODO: naive implementation, data from set can probably # be copied to array more efficienty @@ -1209,7 +1225,7 @@ def sort_impl(a, axis=-1, kind=None, order=None): return sort_impl -def argsort(a, axis=-1, kind=None, order=None): +def argsort(a, axis=-1, kind=None, order=None, ascending=True): """ Returns the indices that would sort an array. @@ -1238,7 +1254,7 @@ def argsort(a, axis=-1, kind=None, order=None): @sdc_overload(argsort) -def argsort_overload(a, axis=-1, kind=None, order=None): +def argsort_overload(a, axis=-1, kind=None, order=None, ascending=True): _func_name = 'argsort' ty_checker = TypeChecker(_func_name) @@ -1250,15 +1266,15 @@ def argsort_overload(a, axis=-1, kind=None, order=None): if not is_default(order, None): raise TypingError(f'{_func_name} Unsupported parameter order') - def argsort_impl(a, axis=-1, kind=None, order=None): + def argsort_impl(a, axis=-1, kind=None, order=None, ascending=True): _kind = 'quicksort' if kind is not None: _kind = kind if _kind == 'quicksort': - return parallel_argsort(a) + return parallel_argsort(a, ascending) elif _kind == 'mergesort': - return parallel_stable_argsort(a) + return parallel_stable_argsort(a, ascending) else: raise ValueError("Unsupported value of 'kind' parameter") diff --git a/sdc/functions/sort.py b/sdc/functions/sort.py index 7ddbc8f31..78a332d12 100644 --- a/sdc/functions/sort.py +++ b/sdc/functions/sort.py @@ -47,7 +47,7 @@ def bind(sym, sig): parallel_sort_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_uint64, ct.c_uint64, ct.c_void_p,) -parallel_argsort_arithm_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64) +parallel_argsort_arithm_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64, ct.c_uint8) parallel_argsort_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64, ct.c_uint64, ct.c_void_p,) @@ -66,7 +66,7 @@ def bind(sym, sig): 
parallel_sort_t_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_uint64) -parallel_argsort_t_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64) +parallel_argsort_t_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64, ct.c_uint8) set_threads_count_sig = ct.CFUNCTYPE(None, ct.c_uint64) set_threads_count_sym = bind('set_number_of_threads', set_threads_count_sig) @@ -290,17 +290,19 @@ def parallel_xargsort_overload_impl(dt, xargsort_map, xargsort_sym): if dt in types_to_postfix.keys(): sort_f = xargsort_map[dt] - def parallel_xargsort_arithm_impl(arr): + def parallel_xargsort_arithm_impl(arr, ascending=True): index = numpy.empty(shape=len(arr), dtype=numpy.int64) - sort_f(index.ctypes, arr.ctypes, len(arr)) + sort_f(index.ctypes, arr.ctypes, len(arr), types.uint8(ascending)) return index return parallel_xargsort_arithm_impl - def parallel_xargsort_impl(arr): + # TO-DO: add/change adaptor to handle case of ascending=False + def parallel_xargsort_impl(arr, ascending=True): item_size = itemsize(arr) index = numpy.empty(shape=len(arr), dtype=numpy.int64) + xargsort_sym(index.ctypes, arr.ctypes, len(arr), item_size, adaptor(arr[0], arr[0])) return index @@ -308,12 +310,12 @@ def parallel_xargsort_impl(arr): return parallel_xargsort_impl -def parallel_argsort(arr): +def parallel_argsort(arr, ascending=True): pass @overload(parallel_argsort) -def parallel_argsort_overload(arr): +def parallel_argsort_overload(arr, ascending=True): if not isinstance(arr, types.Array): raise NotImplementedError @@ -323,12 +325,12 @@ def parallel_argsort_overload(arr): return parallel_xargsort_overload_impl(dt, argsort_map, parallel_argsort_sym) -def parallel_stable_argsort(arr): +def parallel_stable_argsort(arr, ascending=True): pass @overload(parallel_stable_argsort) -def parallel_argsort_overload(arr): +def parallel_stable_argsort_overload(arr, ascending=True): if not isinstance(arr, types.Array): raise NotImplementedError diff --git a/sdc/hiframes/api.py 
b/sdc/hiframes/api.py index 77436f49b..8a0b7f622 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -39,6 +39,7 @@ from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (StringArrayType, string_array_type) from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType from sdc.hiframes.pd_series_ext import ( SeriesType, if_series_to_array_type) @@ -160,7 +161,7 @@ def fix_df_array_list_str_impl(column): # pragma: no cover if isinstance(column, SeriesType): return lambda column: column._data - if isinstance(column, RangeIndexType): + if isinstance(column, (RangeIndexType, Int64IndexType)): return lambda column: np.array(column) if isinstance(column, (types.Array, StringArrayType, Categorical)): @@ -179,10 +180,16 @@ def fix_df_index_overload(index): def fix_df_index_impl(index): return None - elif isinstance(index, RangeIndexType): + elif isinstance(index, (RangeIndexType, Int64IndexType)): def fix_df_index_impl(index): return index + # currently only signed integer indexes are represented with own type + # TO-DO: support Uint64Index and Float64Indexes + elif isinstance(index.dtype, types.Integer) and index.dtype.signed: + def fix_df_index_impl(index): + index_data = fix_df_array(index) + return pd.Int64Index(index_data) else: # default case, transform index the same as df data def fix_df_index_impl(index): diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 5e6930da9..12cb8850f 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -47,15 +47,17 @@ from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical from sdc.hiframes.pd_series_ext import SeriesType from sdc.hiframes.pd_series_type import _get_series_array_type - from sdc.hiframes.pd_dataframe_ext import get_structure_maps +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types from .. 
import hstr_ext import llvmlite.binding as ll from llvmlite import ir as lir from llvmlite.llvmpy.core import Type as LLType from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType from sdc.extensions.indexes.range_index_ext import box_range_index, unbox_range_index +from sdc.extensions.indexes.int64_index_ext import box_int64_index, unbox_int64_index from sdc.str_arr_type import StringArrayType ll.add_symbol('array_size', hstr_ext.array_size) ll.add_symbol('array_getptr1', hstr_ext.array_getptr1) @@ -192,6 +194,8 @@ def _infer_series_list_dtype(S): def _infer_index_type(index): """ Deduces native Numba type used to represent index Python object """ + + # more specific types go first (e.g. RangeIndex is subtype of Int64Index) if isinstance(index, pd.RangeIndex): # depending on actual index value unbox to diff types: none-index if it matches # positions or to RangeIndexType in general case @@ -206,6 +210,14 @@ def _infer_index_type(index): # for unsupported pandas indexes we explicitly unbox to None if isinstance(index, pd.DatetimeIndex): return types.none + + if isinstance(index, pd.Int64Index): + index_data_type = numba.typeof(index._data) + if index.name is None: + return Int64IndexType(index_data_type) + else: + return Int64IndexType(index_data_type, is_named=True) + if index.dtype == np.dtype('O'): # TO-DO: should we check that all elements are strings? 
if len(index) > 0 and isinstance(index[0], str): @@ -323,9 +335,14 @@ def _unbox_index_data(index_typ, index_obj, c): if isinstance(index_typ, RangeIndexType): return unbox_range_index(index_typ, index_obj, c) + if isinstance(index_typ, Int64IndexType): + return unbox_int64_index(index_typ, index_obj, c) + if index_typ == string_array_type: return unbox_str_series(index_typ, index_obj, c) + # this is still here only because of Float64Index represented as array + # TO-DO: remove when it's added if isinstance(index_typ, types.Array): index_data = c.pyapi.object_getattr_string(index_obj, "_data") res = unbox_array(index_typ, index_data, c) @@ -437,10 +454,12 @@ def _box_index_data(index_typ, val, c): c: LLVM context object Returns: Python object native value is boxed into """ - assert isinstance(index_typ, (RangeIndexType, StringArrayType, types.Array, types.NoneType)) + assert isinstance(index_typ, sdc_pandas_index_types) if isinstance(index_typ, RangeIndexType): index = box_range_index(index_typ, val, c) + elif isinstance(index_typ, Int64IndexType): + index = box_int64_index(index_typ, val, c) elif isinstance(index_typ, types.Array): index = box_array(index_typ, val, c) elif isinstance(index_typ, StringArrayType): diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index d48aaf0f1..0f062a4bf 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -48,7 +48,6 @@ from sdc.hiframes.pd_series_type import SeriesType from sdc.datatypes.categorical.pdimpl import is_categoricaldtype from sdc.datatypes.series.pdimpl import _Series_category -from sdc.datatypes.range_index_type import RangeIndexType def is_str_series_typ(t): diff --git a/sdc/native/module.cpp b/sdc/native/module.cpp index c0d9ff606..1571508ab 100644 --- a/sdc/native/module.cpp +++ b/sdc/native/module.cpp @@ -60,31 +60,31 @@ extern "C" void parallel_argsort_u64v(void* index, void* begin, uint64_t len, uint64_t size, void* compare); - void 
parallel_argsort_u64i8(void* index, void* begin, uint64_t len); - void parallel_argsort_u64u8(void* index, void* begin, uint64_t len); - void parallel_argsort_u64i16(void* index, void* begin, uint64_t len); - void parallel_argsort_u64u16(void* index, void* begin, uint64_t len); - void parallel_argsort_u64i32(void* index, void* begin, uint64_t len); - void parallel_argsort_u64u32(void* index, void* begin, uint64_t len); - void parallel_argsort_u64i64(void* index, void* begin, uint64_t len); - void parallel_argsort_u64u64(void* index, void* begin, uint64_t len); - - void parallel_argsort_u64f32(void* index, void* begin, uint64_t len); - void parallel_argsort_u64f64(void* index, void* begin, uint64_t len); + void parallel_argsort_u64i8(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64u8(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64i16(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64u16(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64i32(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64u32(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64i64(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64u64(void* index, void* begin, uint64_t len, uint8_t ascending); + + void parallel_argsort_u64f32(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64f64(void* index, void* begin, uint64_t len, uint8_t ascending); void parallel_stable_argsort_u64v(void* index, void* begin, uint64_t len, uint64_t size, void* compare); - void parallel_stable_argsort_u64i8(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64u8(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64i16(void* index, void* begin, uint64_t len); - void 
parallel_stable_argsort_u64u16(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64i32(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64u32(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64i64(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64u64(void* index, void* begin, uint64_t len); - - void parallel_stable_argsort_u64f32(void* index, void* begin, uint64_t len); - void parallel_stable_argsort_u64f64(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64i8(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64u8(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64i16(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64u16(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64i32(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64u32(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64i64(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64u64(void* index, void* begin, uint64_t len, uint8_t ascending); + + void parallel_stable_argsort_u64f32(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64f64(void* index, void* begin, uint64_t len, uint8_t ascending); void set_number_of_threads(uint64_t threads) { diff --git a/sdc/native/sort.cpp b/sdc/native/sort.cpp index 2e92c46b9..9a6b96117 100644 --- a/sdc/native/sort.cpp +++ b/sdc/native/sort.cpp @@ -92,8 +92,16 @@ void parallel_argsort_(I* index, void* data, uint64_t len, uint64_t size, compar } // namespace #define declare_single_argsort(index_prefix, type_prefix, ity, ty) \ -void parallel_argsort_##index_prefix##type_prefix(void* index, void* begin, uint64_t len) \ -{ 
parallel_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len); } +void parallel_argsort_##index_prefix##type_prefix(void* index, void* begin, uint64_t len, uint8_t ascending) \ +{ \ + if (ascending) { \ + auto cmp = utils::less(); \ + parallel_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ + } else { \ + auto cmp = utils::greater(); \ + parallel_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ + } \ +} #define declare_argsort(prefix, ty) \ declare_single_argsort(u8, prefix, uint8_t, ty) \ diff --git a/sdc/native/stable_sort.cpp b/sdc/native/stable_sort.cpp index 38b5e2758..05685086f 100644 --- a/sdc/native/stable_sort.cpp +++ b/sdc/native/stable_sort.cpp @@ -281,8 +281,16 @@ struct parallel_sort_fixed_size } // namespace #define declare_single_argsort(index_prefix, type_prefix, ity, ty) \ -void parallel_stable_argsort_##index_prefix##type_prefix(ity* index, void* begin, uint64_t len) \ -{ parallel_stable_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len); } +void parallel_stable_argsort_##index_prefix##type_prefix(ity* index, void* begin, uint64_t len, uint8_t ascending) \ +{ \ + if (ascending) { \ + auto cmp = utils::less(); \ + parallel_stable_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ + } else { \ + auto cmp = utils::greater(); \ + parallel_stable_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ + } \ +} #define declare_argsort(prefix, ty) \ declare_single_argsort(u8, prefix, uint8_t, ty) \ @@ -339,4 +347,4 @@ void parallel_stable_sort(void* begin, uint64_t len, uint64_t size, void* compar #undef declare_int_sort #undef declare_sort #undef declare_argsort -#undef declare_single_argsort \ No newline at end of file +#undef declare_single_argsort diff --git a/sdc/native/utils.cpp b/sdc/native/utils.cpp index 8067c094d..7d5985d25 100644 --- a/sdc/native/utils.cpp +++ b/sdc/native/utils.cpp @@ -169,4 +169,16 @@ bool nanless(const double& left, 
const double& right) return std::less()(left, right) || (std::isnan(right) && !std::isnan(left)); } +template<> +bool nangreater(const float& left, const float& right) +{ + return std::greater()(left, right) || (std::isnan(right) && !std::isnan(left)); +} + +template<> +bool nangreater(const double& left, const double& right) +{ + return std::greater()(left, right) || (std::isnan(right) && !std::isnan(left)); +} + } diff --git a/sdc/native/utils.hpp b/sdc/native/utils.hpp index d3633ca63..5be6b327b 100644 --- a/sdc/native/utils.hpp +++ b/sdc/native/utils.hpp @@ -266,6 +266,27 @@ struct less } }; +template +bool nangreater(const T& left, const T& right) +{ + return std::greater()(left, right); +} + +template<> +bool nangreater(const float& left, const float& right); + +template<> +bool nangreater(const double& left, const double& right); + +template +struct greater +{ + bool operator() (const T& left, const T& right) const + { + return nangreater(left, right); + } +}; + namespace tbb_control { void init(); diff --git a/sdc/set_ext.py b/sdc/set_ext.py index 645cf2df6..1223c637a 100644 --- a/sdc/set_ext.py +++ b/sdc/set_ext.py @@ -128,6 +128,7 @@ def _build_str_set_impl(A): str_set.add(_str) return str_set + # TODO: remove since probably unused @overload(set) def init_set_string_array(A): diff --git a/sdc/str_arr_ext.py b/sdc/str_arr_ext.py index 1d12a80a5..e929785f5 100644 --- a/sdc/str_arr_ext.py +++ b/sdc/str_arr_ext.py @@ -54,6 +54,50 @@ from sdc.utilities.sdc_typing_utils import check_is_array_of_dtype +ll.add_symbol('get_str_len', hstr_ext.get_str_len) +ll.add_symbol('allocate_string_array', hstr_ext.allocate_string_array) +ll.add_symbol('setitem_string_array', hstr_ext.setitem_string_array) +ll.add_symbol('getitem_string_array', hstr_ext.getitem_string_array) +ll.add_symbol('getitem_string_array_std', hstr_ext.getitem_string_array_std) +ll.add_symbol('is_na', hstr_ext.is_na) +ll.add_symbol('string_array_from_sequence', hstr_ext.string_array_from_sequence) 
+ll.add_symbol('np_array_from_string_array', hstr_ext.np_array_from_string_array) +ll.add_symbol('print_int', hstr_ext.print_int) +ll.add_symbol('convert_len_arr_to_offset', hstr_ext.convert_len_arr_to_offset) +ll.add_symbol('set_string_array_range', hstr_ext.set_string_array_range) +ll.add_symbol('str_arr_to_int64', hstr_ext.str_arr_to_int64) +ll.add_symbol('str_arr_to_float64', hstr_ext.str_arr_to_float64) +ll.add_symbol('dtor_string_array', hstr_ext.dtor_string_array) +ll.add_symbol('c_glob', hstr_ext.c_glob) +ll.add_symbol('decode_utf8', hstr_ext.decode_utf8) +ll.add_symbol('get_utf8_size', hstr_ext.get_utf8_size) +ll.add_symbol('stable_argsort', hstr_ext.stable_argsort) + + +convert_len_arr_to_offset = types.ExternalFunction("convert_len_arr_to_offset", + types.void(types.voidptr, + types.intp)) + +setitem_string_array = types.ExternalFunction("setitem_string_array", + types.void(types.voidptr, + types.voidptr, + types.intp, + string_type, + types.intp)) + +_get_utf8_size = types.ExternalFunction("get_utf8_size", + types.intp(types.voidptr, # data_ptr + types.intp, # length + types.int32)) # kind + +_stable_argsort = types.ExternalFunction("stable_argsort", + types.void(types.intp, # data_ptr + types.intp, # offset_ptr + types.uint64, # data size + types.int8, # ascending + types.intp)) # result ptr + + @typeof_impl.register(StringArray) def typeof_string_array(val, c): return string_array_type @@ -523,34 +567,6 @@ def str_arr_len(str_arr): return str_arr_len -ll.add_symbol('get_str_len', hstr_ext.get_str_len) -ll.add_symbol('allocate_string_array', hstr_ext.allocate_string_array) -ll.add_symbol('setitem_string_array', hstr_ext.setitem_string_array) -ll.add_symbol('getitem_string_array', hstr_ext.getitem_string_array) -ll.add_symbol('getitem_string_array_std', hstr_ext.getitem_string_array_std) -ll.add_symbol('is_na', hstr_ext.is_na) -ll.add_symbol('string_array_from_sequence', hstr_ext.string_array_from_sequence) -ll.add_symbol('np_array_from_string_array', 
hstr_ext.np_array_from_string_array) -ll.add_symbol('print_int', hstr_ext.print_int) -ll.add_symbol('convert_len_arr_to_offset', hstr_ext.convert_len_arr_to_offset) -ll.add_symbol('set_string_array_range', hstr_ext.set_string_array_range) -ll.add_symbol('str_arr_to_int64', hstr_ext.str_arr_to_int64) -ll.add_symbol('str_arr_to_float64', hstr_ext.str_arr_to_float64) -ll.add_symbol('dtor_string_array', hstr_ext.dtor_string_array) -ll.add_symbol('c_glob', hstr_ext.c_glob) -ll.add_symbol('decode_utf8', hstr_ext.decode_utf8) -ll.add_symbol('get_utf8_size', hstr_ext.get_utf8_size) - -convert_len_arr_to_offset = types.ExternalFunction("convert_len_arr_to_offset", types.void(types.voidptr, types.intp)) - - -setitem_string_array = types.ExternalFunction("setitem_string_array", - types.void(types.voidptr, types.voidptr, types.intp, string_type, - types.intp)) -_get_utf8_size = types.ExternalFunction("get_utf8_size", - types.intp(types.voidptr, types.intp, types.int32)) - - def construct_string_array(context, builder): """Creates meminfo and sets dtor. 
""" @@ -1444,3 +1460,14 @@ def sdc_str_arr_operator_is(context, builder, sig, args): ma = builder.ptrtoint(a.meminfo, cgutils.intp_t) mb = builder.ptrtoint(b.meminfo, cgutils.intp_t) return builder.icmp_signed('==', ma, mb) + + +@numba.njit(no_cpython_wrapper=True) +def str_arr_stable_argosort(arr, ascending=True): + argsort_res = np.empty(len(arr), dtype=np.int64) + _stable_argsort(get_data_ptr(arr).data, + get_offset_ptr(arr).data, + len(arr), + types.int8(ascending), + argsort_res.ctypes.data) + return argsort_res diff --git a/sdc/tests/__init__.py b/sdc/tests/__init__.py index eaba6a8a9..eeb4014b8 100644 --- a/sdc/tests/__init__.py +++ b/sdc/tests/__init__.py @@ -44,7 +44,7 @@ from sdc.tests.test_io import * from sdc.tests.test_hpat_jit import * -from sdc.tests.test_indexes import * +from sdc.tests.indexes import * from sdc.tests.test_sdc_numpy import * from sdc.tests.test_prange_utils import * diff --git a/sdc/tests/indexes/__init__.py b/sdc/tests/indexes/__init__.py new file mode 100644 index 000000000..756d8cb55 --- /dev/null +++ b/sdc/tests/indexes/__init__.py @@ -0,0 +1,29 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from sdc.tests.indexes.test_range_index import TestRangeIndex +from sdc.tests.indexes.test_int64_index import TestInt64Index +from sdc.tests.indexes.test_indexes import TestIndexes diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py new file mode 100644 index 000000000..ba1ea5700 --- /dev/null +++ b/sdc/tests/indexes/index_datagens.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +from itertools import (combinations_with_replacement, filterfalse, chain) +from sdc.tests.test_utils import gen_strlist + + +test_global_index_names = [None, 'abc', 'index'] +test_global_range_member_values = [1, 2, 10, -5, 0, None] + + +def _generate_valid_range_params(): + + def valid_params_predicate(range_params): + # if step is zero or all start/stop/step are None range is invalid + return (range_params[-1] == 0 + or all(map(lambda x: x is None, range_params))) + + return filterfalse( + valid_params_predicate, + combinations_with_replacement(test_global_range_member_values, 3) + ) + + +def _generate_range_indexes_fixed(size, start=1, step=3): + yield pd.RangeIndex(size) + yield pd.RangeIndex(size, name='abc') + yield pd.RangeIndex(stop=step * size, step=step) + yield pd.RangeIndex(stop=2*step*size, step=2*step) + yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) + yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) + + +def _generate_index_param_values(n): + return chain( + [None], + 
_generate_range_indexes_fixed(n), + _generate_int64_indexes_fixed(n), + [np.arange(n) / 2], + [np.arange(n, dtype=np.uint64)], + [gen_strlist(n)], + ) + + +def _generate_valid_int64_index_data(): + n = 100 + yield np.arange(n) + yield np.arange(n) % 2 + yield np.ones(n, dtype=np.int16) + yield list(np.arange(n)) + yield pd.RangeIndex(n) + yield pd.Int64Index(np.arange(n)) + yield np.arange(n) * 2 + yield np.arange(2 * n) + + +def _generate_int64_indexes_fixed(size): + yield pd.Int64Index(np.arange(size)) + yield pd.Int64Index(np.arange(size), name='abc') + yield pd.Int64Index([i if i % 2 else 0 for i in range(size)]) + yield pd.Int64Index([i // 2 for i in range(size)]) + yield pd.Int64Index(np.ones(size)) diff --git a/sdc/tests/indexes/test_indexes.py b/sdc/tests/indexes/test_indexes.py new file mode 100644 index 000000000..fa8bf6f71 --- /dev/null +++ b/sdc/tests/indexes/test_indexes.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +import unittest + +from sdc.tests.indexes import TestRangeIndex, TestInt64Index +from sdc.tests.indexes.index_datagens import _generate_index_param_values + + +class TestIndexes( + TestRangeIndex, + TestInt64Index + ): + """ This suite combines tests from all concrete index-type suites and also adds + tests for common use-cases that need to be checked for all index-types. """ + + def assert_indexes_equal(self, index1, index2): + # for SDC indexes that are represented with arrays (e.g. 
Uint64Index) + supported_pandas_indexes = (pd.RangeIndex, pd.Int64Index, ) + if (not isinstance(index1, supported_pandas_indexes) + or not isinstance(index2, supported_pandas_indexes)): + index1 = np.asarray(index1) + index2 = np.asarray(index2) + np.testing.assert_array_equal(index1, index2) + else: + pd.testing.assert_index_equal(index1, index2) + + @unittest.skip("TODO: support boxing/unboxing and parent ref for Python ranges in Numba") + def test_indexes_unbox_data_id_check(self): + def test_impl(index): + return index + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.RangeIndex(n, name='abc'), # only this one fails, other pass + pd.Int64Index(np.arange(n), name='abc'), + ] + data_attr_names_map = { + pd.RangeIndex: '_range', + pd.Int64Index: '_data', + } + + for index in indexes_to_test: + with self.subTest(index_type=type(index)): + result = sdc_func(index) + result_ref = test_impl(index) + + data1, data2, data3 = map( + lambda x: getattr(x, data_attr_names_map[type(x)]), + [index, result, result_ref] + ) + self.assertIs(data1, data3) + self.assertIs(data2, data3) + + @unittest.skip("Needs writable native struct type members in Numba") + def test_indexes_named_set_name(self): + def test_impl(index): + index.name = 'def' + return index + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.RangeIndex(n, name='abc'), + pd.Int64Index(np.arange(n), name='abc'), + ] + + for index in indexes_to_test: + with self.subTest(index_type=type(index)): + index1 = index.copy(deep=True) + index2 = index.copy(deep=True) + result = sdc_func(index1) + result_ref = test_impl(index2) + pd.testing.assert_index_equal(result, result_ref) + + @unittest.skip("Needs writable native struct type members and single common type for name") + def test_indexes_unnamed_set_name(self): + def test_impl(index): + index.name = 'def' + return index + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.RangeIndex(n), + 
pd.Int64Index(np.arange(n)), + ] + + for index in indexes_to_test: + with self.subTest(index_type=type(index)): + index1 = index.copy(deep=True) + index2 = index.copy(deep=True) + result = sdc_func(index1) + result_ref = test_impl(index2) + pd.testing.assert_index_equal(result, result_ref) + + @unittest.skip("Need support unboxing pandas indexes with parent ref") + def test_indexes_operator_is_unbox(self): + def test_impl(index1, index2): + return index1 is index2 + sdc_func = self.jit(test_impl) + + indexes_to_test = [ + pd.RangeIndex(1, 21, 3), + pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), + ] + + for index in indexes_to_test: + # positive testcase + with self.subTest(subtest="same indexes"): + index1 = index.copy(deep=True) + index2 = index1 + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + index1 = index.copy(deep=True) + index2 = index.copy(deep=True) + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_indexes_unbox_series_with_index(self): + @self.jit + def test_impl(S): + # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, + # but this requires separate type (e.g. 
DefaultIndexType) instead of types.none as default index + return S.index + + n = 11 + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(series_index=index): + S = pd.Series(np.ones(n), index=index) + result = test_impl(S) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_create_series_with_index(self): + @self.jit + def test_impl(data, index): + S = pd.Series(data=data, index=index) + return S.index + + n = 11 + series_data = np.ones(n) + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(series_index=index): + result = test_impl(series_data, index) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_box_series_with_index(self): + def test_impl(data, index): + return pd.Series(data=data, index=index) + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.ones(n) + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + result = sdc_func(series_data, index) + result_ref = test_impl(series_data, index) + pd.testing.assert_series_equal(result, result_ref) + + def test_indexes_index_get_series_index(self): + def test_impl(S): + return S.index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + S = pd.Series(np.ones(n), index=index) + result = sdc_func(S) + result_ref = test_impl(S) + self.assert_indexes_equal(result, result_ref) + + def test_indexes_index_unbox_df_with_index(self): + @self.jit + def test_impl(df): + # TO-DO: this actually includes calling 'index' attribute overload, should really be df._index, + # but this requires separate type (e.g. 
DefaultIndexType) instead of types.none as default index + return df.index + + n = 11 + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(df_index=index): + df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}, index=index) + result = test_impl(df) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_index_create_df_with_index(self): + @self.jit + def test_impl(A, B, index): + df = pd.DataFrame({'A': A, 'B': B}, index=index) + return df.index + + n = 11 + A, B = np.ones(n), np.arange(n) + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(df_index=index): + result = test_impl(A, B, index) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_index_box_df_with_index(self): + def test_impl(A, B, index): + return pd.DataFrame({'A': A, 'B': B}, index=index) + sdc_func = self.jit(test_impl) + + n = 11 + A, B = np.ones(n), np.arange(n, dtype=np.intp) + for index in _generate_index_param_values(n): + with self.subTest(df_index=index): + result = sdc_func(A, B, index) + result_ref = test_impl(A, B, index) + pd.testing.assert_frame_equal(result, result_ref) + + def test_indexes_index_get_df_index(self): + def test_impl(df): + return df.index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_index_param_values(n): + with self.subTest(df_index=index): + df = pd.DataFrame({'A': np.ones(n)}, index=index) + result = sdc_func(df) + result_ref = test_impl(df) + self.assert_indexes_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/indexes/test_int64_index.py b/sdc/tests/indexes/test_int64_index.py new file mode 100644 index 000000000..875d6e6dc --- /dev/null +++ b/sdc/tests/indexes/test_int64_index.py @@ -0,0 +1,583 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# 
Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numpy as np +import pandas as pd +import unittest +from itertools import (combinations_with_replacement, product, ) + +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_valid_int64_index_data, + _generate_int64_indexes_fixed, + ) +from sdc.tests.test_base import TestCase + + +class TestInt64Index(TestCase): + + def test_int64_index_create_and_box(self): + def test_impl(data, name): + return pd.Int64Index(data, name=name) + sdc_func = self.jit(test_impl) + + name = 'index' + for data in _generate_valid_int64_index_data(): + with self.subTest(index_data=data): + result = sdc_func(data, name) + result_ref = test_impl(data, name) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_unbox_and_box(self): + def test_impl(index): + return index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_int64_indexes_fixed(n): + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_create_param_copy_true(self): + def test_impl(arr): + return pd.Int64Index(arr, copy=True) + sdc_func = self.jit(test_impl) + + index_data_to_test = [ + np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64), + list(np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64)), + pd.RangeIndex(11), + pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), + ] + + for index_data in index_data_to_test: + with self.subTest(index_data=index_data): + result = sdc_func(index_data) + result_ref = test_impl(index_data) + pd.testing.assert_index_equal(result, result_ref) + self.assertEqual(result._data is result_ref._data, False) + + def test_int64_index_create_param_copy_default(self): + def test_impl(arr): + return pd.Int64Index(arr) + sdc_func = self.jit(test_impl) + + # only test data that has underlying array that can be referenced + # and ensure it has int64 dtype as 
otherwise there will always be a copy + index_data_to_test = [ + np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64), + pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), + ] + + for index_data in index_data_to_test: + with self.subTest(index_data=index_data): + result = sdc_func(index_data) + result_ref = test_impl(index_data) + pd.testing.assert_index_equal(result, result_ref) + self.assertEqual(result._data is result_ref._data, True) + + def test_int64_index_create_param_dtype(self): + def test_impl(n, dtype): + return pd.Int64Index(np.arange(n), dtype=dtype) + sdc_func = self.jit(test_impl) + + n = 11 + supported_dtypes = [None, np.int64, 'int64', np.int32, 'int32'] + for dtype in supported_dtypes: + with self.subTest(dtype=dtype): + result = sdc_func(n, dtype) + result_ref = test_impl(n, dtype) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_create_param_dtype_invalid(self): + def test_impl(n, dtype): + return pd.Int64Index(np.arange(n), dtype=dtype) + sdc_func = self.jit(test_impl) + + n = 11 + invalid_dtypes = ['float', 'uint'] + for dtype in invalid_dtypes: + with self.subTest(dtype=dtype): + with self.assertRaises(Exception) as context: + test_impl(n, dtype) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(n, dtype) + sdc_exception = context.exception + self.assertIn(str(sdc_exception), str(pandas_exception)) + + def test_int64_index_attribute_dtype(self): + def test_impl(index): + return index.dtype + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_attribute_name(self): + def test_impl(index): + return index.name + sdc_func = self.jit(test_impl) + + n = 11 + index_data = np.arange(n) * 2 + for name in test_global_index_names: + with self.subTest(name=name): + index = pd.Int64Index(index_data, name=name) + result = 
sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_len(self): + def test_impl(index): + return len(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2, name='index') + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_attribute_values(self): + def test_impl(index): + return index.values + sdc_func = self.jit(test_impl) + + for data in _generate_valid_int64_index_data(): + index = pd.Int64Index(data) + with self.subTest(index_data=data): + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_contains(self): + def test_impl(index, value): + return value in index + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([1, 11, 2]) + values_to_test = [-5, 15, 1, 11, 5, 6] + for value in values_to_test: + with self.subTest(value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_copy(self): + def test_impl(index, new_name): + return index.copy(name=new_name) + sdc_func = self.jit(test_impl) + + for data in _generate_valid_int64_index_data(): + for name, new_name in product(test_global_index_names, repeat=2): + index = pd.Int64Index(data, name=name) + with self.subTest(index=index, new_name=new_name): + result = sdc_func(index, new_name) + result_ref = test_impl(index, new_name) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_copy_param_deep(self): + def test_impl(index, deep): + return index.copy(deep=deep) + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([1, 11, 2]) + for deep in [True, False]: + with self.subTest(deep=deep): + result = sdc_func(index, deep) + result_ref = test_impl(index, deep) + pd.testing.assert_index_equal(result, result_ref) + # pandas uses ndarray views 
when copies index, so for python + # case check that data arrays share the same memory + self.assertEqual( + result._data is index._data, + result_ref._data.base is index._data + ) + + def test_int64_index_getitem_scalar(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + for data in _generate_valid_int64_index_data(): + index = pd.Int64Index(data) + n = len(index) + values_to_test = [-n, n // 2, n - 1] + for idx in values_to_test: + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + self.assertEqual(result, result_ref) + + def test_int64_index_getitem_scalar_idx_bounds(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2, name='abc') + values_to_test = [-(n + 1), n] + for idx in values_to_test: + with self.subTest(idx=idx): + with self.assertRaises(Exception) as context: + test_impl(index, idx) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index, idx) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn("out of bounds", str(sdc_exception)) + + def test_int64_index_getitem_slice(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + index_len = 11 + slices_params = combinations_with_replacement( + [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], + 3 + ) + + for data in _generate_valid_int64_index_data(): + for slice_start, slice_stop, slice_step in slices_params: + # slice step cannot be zero + if slice_step == 0: + continue + + idx = slice(slice_start, slice_stop, slice_step) + index = pd.Int64Index(data, name='abc') + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, 
result_ref) + + def test_int64_index_iterator_1(self): + def test_impl(index): + res = [] + for i, label in enumerate(index): + res.append((i, label)) + return res + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([5, 3, 2, 1, 7, 4]) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_iterator_2(self): + def test_impl(index): + res = [] + for label in index: + if not label % 2: + res.append(label) + return res + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([5, 3, 2, 1, 7, 4]) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_nparray(self): + def test_impl(index): + return np.array(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_eq_index(self): + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_int64_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_eq_scalar(self): + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + A = pd.Int64Index(np.arange(n) * 2) + scalars_to_test = [0, 22, 13, -5, 4.0] + for B in scalars_to_test: + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_eq_nparray(self): + def test_impl(A, 
B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + for A, B in product( + _generate_int64_indexes_fixed(n), + map(lambda x: np.array(x), _generate_int64_indexes_fixed(n)) + ): + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_ne_index(self): + def test_impl(index1, index2): + return index1 != index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_int64_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_is_nounbox(self): + def test_impl_1(data): + index1 = pd.Int64Index(data) + index2 = index1 + return index1 is index2 + sdc_func_1 = self.jit(test_impl_1) + + def test_impl_2(data): + index1 = pd.Int64Index(data) + index2 = pd.Int64Index(data) + return index1 is index2 + sdc_func_2 = self.jit(test_impl_2) + + # positive testcase + index_data = [1, 2, 3, 5, 6, 3, 4] + with self.subTest(subtest="same indexes"): + result = sdc_func_1(index_data) + result_ref = test_impl_1(index_data) + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + result = sdc_func_2(index_data) + result_ref = test_impl_2(index_data) + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_int64_index_getitem_by_mask(self): + def test_impl(index, mask): + return index[mask] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + mask = np.random.choice([True, False], n) + for index in _generate_int64_indexes_fixed(n): + result = 
sdc_func(index, mask) + result_ref = test_impl(index, mask) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_support_reindexing(self): + from sdc.datatypes.common_functions import sdc_reindex_series + + def pyfunc(data, index, name, by_index): + S = pd.Series(data, index, name=name) + return S.reindex(by_index) + + @self.jit + def sdc_func(data, index, name, by_index): + return sdc_reindex_series(data, index, name, by_index) + + n = 10 + np.random.seed(0) + mask = np.random.choice([True, False], n) + name = 'asdf' + index1 = pd.Int64Index(np.arange(n)) + index2 = pd.Int64Index(np.arange(n))[::-1] + result = sdc_func(mask, index1, name, index2) + result_ref = pyfunc(mask, index1, name, index2) + pd.testing.assert_series_equal(result, result_ref) + + def test_int64_index_support_join(self): + from sdc.datatypes.common_functions import sdc_join_series_indexes + + def pyfunc(index1, index2): + return index1.join(index2, how='outer', return_indexers=True) + + @self.jit + def sdc_func(index1, index2): + return sdc_join_series_indexes(index1, index2) + + index1 = pd.Int64Index(np.arange(-5, 5, 1), name='asv') + index2 = pd.Int64Index(np.arange(0, 10, 2), name='df') + result = sdc_func(index1, index2) + result_ref = pyfunc(index1, index2) + results_names = ['result index', 'left indexer', 'right indexer'] + for i, name in enumerate(results_names): + result_elem = result[i] + result_ref_elem = result_ref[i].values if not i else result_ref[i] + np.testing.assert_array_equal(result_elem, result_ref_elem, f"Mismatch in {name}") + + def test_int64_index_support_take_from(self): + from sdc.datatypes.common_functions import _sdc_take + + def pyfunc(index1, indexes): + return index1.values.take(indexes) + + @self.jit + def sdc_func(index1, indexes): + return _sdc_take(index1, indexes) + + n, k = 1000, 200 + np.random.seed(0) + index = pd.Int64Index(np.arange(n) * 2, name='asd') + indexes = np.random.choice(np.arange(n), n)[:k] + result = sdc_func(index, 
indexes) + result_ref = pyfunc(index, indexes) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_support_take_by(self): + from sdc.datatypes.common_functions import _sdc_take + + def pyfunc(arr, index): + return np.take(arr, index) + + @self.jit + def sdc_func(arr, index): + return _sdc_take(arr, index) + + n, k = 1000, 200 + np.random.seed(0) + arr = np.arange(n) * 2 + index = pd.Int64Index(np.random.choice(np.arange(n), n)[:k]) + result = sdc_func(arr, index) + result_ref = pyfunc(arr, index) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_support_astype(self): + from sdc.functions.numpy_like import astype + + def pyfunc(index): + return index.values.astype(np.int64) + + @self.jit + def sdc_func(index): + return astype(index, np.int64) + + n = 100 + index = pd.Int64Index(np.arange(n) * 2, name='asd') + np.testing.assert_array_equal(sdc_func(index), pyfunc(index)) + + def test_int64_index_support_array_equal(self): + from sdc.functions.numpy_like import array_equal + + def pyfunc(index1, index2): + return np.array_equal(index1.values, index2.values) + + @self.jit + def sdc_func(index1, index2): + return array_equal(index1, index2) + + n = 11 + indexes_to_test = [ + pd.Int64Index(np.arange(n)), + pd.Int64Index(np.arange(n), name='asd'), + pd.Int64Index(np.arange(n) * 2, name='asd'), + pd.Int64Index(np.arange(2 * n)), + ] + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = pyfunc(index1, index2) + self.assertEqual(result, result_ref) + + def test_int64_index_support_copy(self): + from sdc.functions.numpy_like import copy + + @self.jit + def sdc_func(index): + return copy(index) + + for data in _generate_valid_int64_index_data(): + for name in test_global_index_names: + index = pd.Int64Index(data, name=name) + with self.subTest(index=index): + result = sdc_func(index) + 
pd.testing.assert_index_equal(result, index) + + def test_int64_index_support_append(self): + from sdc.datatypes.common_functions import hpat_arrays_append + + def pyfunc(index1, index2): + return index1.append(index2) + + @self.jit + def sdc_func(index1, index2): + return hpat_arrays_append(index1, index2) + + n = 11 + index1 = pd.Int64Index(np.arange(n), name='asv') + index2 = pd.Int64Index(2 * np.arange(n), name='df') + result = sdc_func(index1, index2) + result_ref = pyfunc(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/test_indexes.py b/sdc/tests/indexes/test_range_index.py similarity index 74% rename from sdc/tests/test_indexes.py rename to sdc/tests/indexes/test_range_index.py index b277ac9a1..e9369c9f7 100644 --- a/sdc/tests/test_indexes.py +++ b/sdc/tests/indexes/test_range_index.py @@ -28,43 +28,18 @@ import numpy as np import pandas as pd import unittest +from itertools import (combinations_with_replacement, product, ) -from itertools import (combinations_with_replacement, product, filterfalse, chain) - +from numba.core.errors import TypingError +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_valid_range_params, + _generate_range_indexes_fixed, + _generate_index_param_values, + ) from sdc.tests.test_base import TestCase from sdc.utilities.sdc_typing_utils import kwsparams2list from sdc.tests.test_series import _make_func_from_text -from numba.core.errors import TypingError - - -test_global_index_names = [None, 'abc', 'index'] -test_global_range_member_values = [1, 2, 10, -5, 0, None] - - -def _generate_valid_range_params(): - - def 
valid_params_predicate(range_params): - # if step is zero or all start/stop/step are None range is invalid - return (range_params[-1] == 0 - or all(map(lambda x: x is None, range_params))) - - return filterfalse( - valid_params_predicate, - combinations_with_replacement(test_global_range_member_values, 3) - ) - - -def _generate_range_indexes_fixed(size, start=1, step=3): - yield pd.RangeIndex(size) - yield pd.RangeIndex(size, name='abc') - yield pd.RangeIndex(stop=step * size, step=step) - yield pd.RangeIndex(stop=2*step*size, step=2*step) - yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) - yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) - - -def _generate_index_param_values(n): - return chain([None], _generate_range_indexes_fixed(n)) class TestRangeIndex(TestCase): @@ -96,18 +71,6 @@ def test_impl(index): result_ref = test_impl(index) pd.testing.assert_index_equal(result, result_ref) - @unittest.skip("TODO: support boxing/unboxing and parent ref for Python ranges in Numba") - def test_range_index_unbox_data_id_check(self): - def test_impl(index): - return index - sdc_func = self.jit(test_impl) - - index = pd.RangeIndex(11, name='abc') - result = sdc_func(index) - result_ref = test_impl(index) - self.assertIs(index._range, result_ref._range) - self.assertIs(result._range, result_ref._range) - @unittest.skip("TODO: add support for integers as floats in ctor") def test_range_index_create_from_floats(self): def test_impl(*args): @@ -119,7 +82,7 @@ def test_impl(*args): result_ref = test_impl(start, stop, step) pd.testing.assert_index_equal(result, result_ref) - def test_range_index_create_invalid1(self): + def test_range_index_create_invalid_1(self): def test_impl(start, stop, step): return pd.RangeIndex(start, stop, step) sdc_func = self.jit(test_impl) @@ -135,7 +98,7 @@ def test_impl(start, stop, step): sdc_exception = context.exception self.assertIn(str(sdc_exception), str(pandas_exception)) - def 
test_range_index_create_invalid2(self): + def test_range_index_create_invalid_2(self): def test_impl(): return pd.RangeIndex(name='index') sdc_func = self.jit(test_impl) @@ -393,150 +356,6 @@ def test_impl(index, idx): result_ref = test_impl(index, idx) pd.testing.assert_index_equal(result, result_ref) - @unittest.skip("Needs writable native struct type members in Numba") - def test_range_index_named_set_name(self): - def test_impl(index): - index.name = 'def' - return index - sdc_func = self.jit(test_impl) - - n = 11 - index1 = pd.RangeIndex(n, name='abc') - index2 = index1.copy(deep=True) - result = sdc_func(index1) - result_ref = test_impl(index2) - pd.testing.assert_index_equal(result, result_ref) - - @unittest.skip("Needs writable native struct type members and single common type for name") - def test_range_index_unnamed_set_name(self): - def test_impl(index): - index.name = 'def' - return index - sdc_func = self.jit(test_impl) - - n = 11 - index1 = pd.RangeIndex(n, name='abc') - index2 = index1.copy(deep=True) - result = sdc_func(index1) - result_ref = test_impl(index2) - pd.testing.assert_index_equal(result, result_ref) - - def _test_range_indexes(self, test_impl, indexes, size, apply_func): - for index in indexes: - expected_res = pd.RangeIndex(size) if index is None else index - with self.subTest(series_index=index): - args = apply_func(size, index) - result = test_impl(args) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_unbox_series_with_index(self): - @self.jit - def test_impl(S): - # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, - # but this requires separate type (e.g. 
DefaultIndexType) instead of types.none as native index - return S.index - - n = 11 - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(series_index=index): - S = pd.Series(np.ones(n), index=index) - result = test_impl(S) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_create_series_with_index(self): - @self.jit - def test_impl(data, index): - S = pd.Series(data=data, index=index) - return S.index - - n = 11 - series_data = np.ones(n) - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(series_index=index): - result = test_impl(series_data, index) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_box_series_with_index(self): - def test_impl(data, index): - return pd.Series(data=data, index=index) - sdc_func = self.jit(test_impl) - - n = 11 - series_data = np.ones(n) - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - result = sdc_func(series_data, index) - result_ref = test_impl(series_data, index) - pd.testing.assert_series_equal(result, result_ref) - - def test_range_index_get_series_index(self): - def test_impl(S): - return S.index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - S = pd.Series(np.ones(n), index=index) - result = sdc_func(S) - result_ref = test_impl(S) - pd.testing.assert_index_equal(result, result_ref) - - def test_range_index_unbox_df_with_index(self): - @self.jit - def test_impl(df): - return df.index - - n = 11 - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(df_index=index): - df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}, index=index) - result = test_impl(df) - pd.testing.assert_index_equal(result, expected_res) - - 
def test_range_index_create_df_with_index(self): - @self.jit - def test_impl(A, B, index): - df = pd.DataFrame({'A': A, 'B': B}, index=index) - return df.index - - n = 11 - A, B = np.ones(n), np.arange(n) - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(df_index=index): - result = test_impl(A, B, index) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_box_df_with_index(self): - def test_impl(A, B, index): - return pd.DataFrame({'A': A, 'B': B}, index=index) - sdc_func = self.jit(test_impl) - - n = 11 - A, B = np.ones(n), np.arange(n, dtype=np.intp) - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - result = sdc_func(A, B, index) - result_ref = test_impl(A, B, index) - pd.testing.assert_frame_equal(result, result_ref) - - def test_range_index_get_df_index(self): - def test_impl(df): - return df.index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - df = pd.DataFrame({'A': np.ones(n)}, index=index) - result = sdc_func(df) - result_ref = test_impl(df) - pd.testing.assert_index_equal(result, result_ref) - def test_range_index_iterator_1(self): def test_impl(index): res = [] @@ -660,29 +479,7 @@ def test_impl(index1, index2): result_ref = test_impl(index1, index2) np.testing.assert_array_equal(result, result_ref) - @unittest.skip("Need support unboxing Python range in Numba with parent ref") - def test_range_index_operator_is_1(self): - def test_impl(index1, index2): - return index1 is index2 - sdc_func = self.jit(test_impl) - - # positive testcase - with self.subTest(subtest="same indexes"): - index1 = pd.RangeIndex(1, 21, 3) - index2 = index1 - result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) - self.assertEqual(result, result_ref) - - # negative testcase - with self.subTest(subtest="not same indexes"): - 
index1 = pd.RangeIndex(1, 21, 3) - index2 = pd.RangeIndex(1, 21, 3) - result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) - self.assertEqual(result, result_ref) - - def test_range_index_operator_is_2(self): + def test_range_index_operator_is_nounbox(self): def test_impl_1(*args): index1 = pd.RangeIndex(*args) index2 = index1 @@ -701,12 +498,14 @@ def test_impl_2(*args): result = sdc_func_1(*params) result_ref = test_impl_1(*params) self.assertEqual(result, result_ref) + self.assertEqual(result, True) # negative testcase with self.subTest(subtest="not same indexes"): result = sdc_func_2(*params) result_ref = test_impl_2(*params) self.assertEqual(result, result_ref) + self.assertEqual(result, False) def test_range_index_getitem_by_mask(self): def test_impl(index, mask): @@ -719,8 +518,7 @@ def test_impl(index, mask): for index in _generate_range_indexes_fixed(n): result = sdc_func(index, mask) result_ref = test_impl(index, mask) - # FIXME: replace with pd.testing.assert_index_equal when Int64Index is supported - np.testing.assert_array_equal(result, result_ref.values) + pd.testing.assert_index_equal(result, result_ref) def test_range_index_support_reindexing(self): from sdc.datatypes.common_functions import sdc_reindex_series @@ -827,6 +625,34 @@ def sdc_func(index): result = sdc_func(index) pd.testing.assert_index_equal(result, index) + def test_range_index_support_append(self): + from sdc.datatypes.common_functions import hpat_arrays_append + + def pyfunc(index1, index2): + return index1.append(index2) + + @self.jit + def sdc_func(index1, index2): + return hpat_arrays_append(index1, index2) + + n = 11 + index1 = pd.RangeIndex(1, 21, 3, name='asv') + index2 = pd.RangeIndex(19, -1, -3, name='df') + result = sdc_func(index1, index2) + result_ref = pyfunc(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_range_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 
11 + index = pd.RangeIndex(n) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + if __name__ == "__main__": unittest.main() diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index c245f0694..aa90c7f23 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -362,6 +362,30 @@ def test_impl(n): self.assertEqual(count_parfor_REPs(), 0) self.assertEqual(count_parfor_OneDs(), 1) + @unittest.skip("Works, but compile time needs debug") + def test_column_getitem_repeats(self): + def test_impl(a, b, c): + df = pd.DataFrame({ + 'A': a, + 'B': b, + 'C': c, + }) + + A = df['A'] + B = df['B'] + C = df['C'] + return A[0] + B[0] + C[0] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + a = np.ones(n) + b = np.random.ranf(n) + c = np.random.randint(-100, 100, n) + result = sdc_func(a, b, c) + result_ref = pd.Series(test_impl(a, b, c)) + pd.testing.assert_series_equal(result, result_ref) + @skip_numba_jit def test_column_list_getitem1(self): def test_impl(df): diff --git a/sdc/tests/test_date.py b/sdc/tests/test_date.py index 83671cee4..83d001349 100644 --- a/sdc/tests/test_date.py +++ b/sdc/tests/test_date.py @@ -81,7 +81,9 @@ def test_impl(A): hpat_func = self.jit(test_impl) df = self._gen_str_date_df() A = pd.DatetimeIndex(df['str_date']).to_series() - np.testing.assert_array_equal(hpat_func(A), test_impl(A)) + result = hpat_func(A) + result_ref = test_impl(A) + np.testing.assert_array_equal(result, result_ref) @skip_numba_jit def test_datetime_getitem(self): diff --git a/sdc/tests/test_hpat_jit.py b/sdc/tests/test_hpat_jit.py index 5e47f6f8e..8644551c3 100644 --- a/sdc/tests/test_hpat_jit.py +++ b/sdc/tests/test_hpat_jit.py @@ -36,6 +36,7 @@ from sdc import * from sdc.tests.test_base import TestCase from sdc.tests.test_utils import skip_numba_jit +from numba.experimental import jitclass class TestHpatJitIssues(TestCase): diff --git a/sdc/tests/test_rolling.py 
b/sdc/tests/test_rolling.py index 44b50370c..27f625b42 100644 --- a/sdc/tests/test_rolling.py +++ b/sdc/tests/test_rolling.py @@ -1149,7 +1149,6 @@ def test_impl(df, other, pairwise): hpat_func(df, other, True) self.assertIn(msg_tmpl.format('False, None'), str(raises.exception)) - @unittest.expectedFailure def test_df_rolling_cov_issue_floating_point_rounding(self): """ Cover issue of different float rounding in Python and SDC/Numba: diff --git a/sdc/tests/test_sdc_numpy.py b/sdc/tests/test_sdc_numpy.py index d6bda23db..9b7a74846 100644 --- a/sdc/tests/test_sdc_numpy.py +++ b/sdc/tests/test_sdc_numpy.py @@ -387,6 +387,45 @@ def run_test(ref_impl, sdc_impl, data, kind): with self.subTest(data=case, kind=kind, size=len(int_array)): run_test(ref_impl, sdc_func, data, kind) + def test_argsort_param_ascending(self): + + def ref_impl(a, kind, ascending): + return pd.Series(a).sort_values(kind=kind, ascending=ascending).index + + def sdc_impl(a, kind, ascending): + return numpy_like.argsort(a, kind=kind, ascending=ascending) + + def run_test(ref_impl, sdc_impl, data, kind, ascending): + if kind == 'mergesort': + np.testing.assert_array_equal( + ref_impl(data, kind, ascending), + sdc_func(data, kind, ascending)) + else: + sorted_ref = data[ref_impl(data, kind, ascending)] + sorted_sdc = data[sdc_impl(data, kind, ascending)] + np.testing.assert_array_equal(sorted_ref, sorted_sdc) + + sdc_func = self.jit(sdc_impl) + + n = 100 + np.random.seed(0) + data_values = { + 'float': [np.inf, np.NINF, np.nan, 0, -1, 2.1, 2/3, -3/4, 0.777], + 'int': [1, -1, 3, 5, -60, 21, 22, 23], + } + all_dtypes = { + 'float': ['float32', 'float64'], + 'int': ['int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64'] + } + + for kind, ascending in product([None, 'quicksort', 'mergesort'], [True, False]): + for dtype_group, arr_values in data_values.items(): + for dtype in all_dtypes[dtype_group]: + data = np.random.choice(arr_values, n).astype(dtype) + with self.subTest(data=data, 
kind=kind, ascending=ascending): + run_test(ref_impl, sdc_func, data, kind, ascending) + + def _test_fillna_numeric(self, pyfunc, cfunc, inplace): data_to_test = [ [True, False, False, True, True], diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 3c5db9c1f..ec829aa60 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -33,7 +33,7 @@ import sdc import string import unittest -from itertools import combinations, combinations_with_replacement, product +from itertools import combinations, combinations_with_replacement, islice, permutations, product import numba from numba import types from numba.core.config import IS_32BITS @@ -323,24 +323,7 @@ def test_impl(n): n = 11 pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) - def test_create_series_index1(self): - # create and box an indexed Series - def test_impl(): - A = pd.Series([1, 2, 3], ['A', 'C', 'B']) - return A - hpat_func = self.jit(test_impl) - - pd.testing.assert_series_equal(hpat_func(), test_impl()) - - def test_create_series_index2(self): - def test_impl(): - A = pd.Series([1, 2, 3], index=[2, 1, 0]) - return A - hpat_func = self.jit(test_impl) - - pd.testing.assert_series_equal(hpat_func(), test_impl()) - - def test_create_series_index3(self): + def test_create_series_param_name_literal(self): def test_impl(): A = pd.Series([1, 2, 3], index=['A', 'C', 'B'], name='A') return A @@ -348,7 +331,7 @@ def test_impl(): pd.testing.assert_series_equal(hpat_func(), test_impl()) - def test_create_series_index4(self): + def test_create_series_param_name(self): def test_impl(name): A = pd.Series([1, 2, 3], index=['A', 'C', 'B'], name=name) return A @@ -376,7 +359,7 @@ def test_impl(A): S = pd.Series(['a', 'b', 'c'], name='A') self.assertEqual(hpat_func(S), test_impl(S)) - def test_pass_series_index1(self): + def test_pass_series_all_indexes(self): def test_impl(A): return A hpat_func = self.jit(test_impl) @@ -387,6 +370,7 @@ def test_impl(A): list(np.arange(n)), 
np.arange(n), pd.RangeIndex(n), + pd.Int64Index(np.arange(n)), gen_strlist(n) ] for index in indexes_to_test: @@ -2154,16 +2138,26 @@ def test_series_value_counts_numeric_dropna_false(self): def test_impl(S): return S.value_counts(dropna=False) - data_to_test = [[1, 2, 3, 1, 1, 3], - [1, 2, 3, np.nan, 1, 3, np.nan, np.inf], - [0.1, 3., np.nan, 3., 0.1, 3., np.nan, np.inf, 0.1, 0.1]] + data_to_test = [ + [1, 2, 3, 1, 1, 3], + [1, 2, 3, np.nan, 1, 3, np.nan, np.inf], + [0.1, 3., np.nan, 3., 0.1, 3., np.nan, np.inf, 0.1, 0.1] + ] hpat_func = self.jit(test_impl) for data in data_to_test: with self.subTest(series_data=data): S = pd.Series(data) - pd.testing.assert_series_equal(hpat_func(S), test_impl(S)) + result = hpat_func(S) + result_ref = test_impl(S) + + # order within groups of same counts may be different since + # pandas impl uses sort_values() with default kind='quicksort' + pd.testing.assert_series_equal( + result.sort_index(), + result_ref.sort_index() + ) def test_series_value_counts_str_dropna_false(self): def test_impl(S): @@ -2206,13 +2200,15 @@ def test_series_value_counts_index(self): def test_impl(S): return S.value_counts() - hpat_func = self.jit(test_impl) + sdc_func = self.jit(test_impl) for data in test_global_input_data_integer64: + index = np.arange(start=1, stop=len(data) + 1) with self.subTest(series_data=data): - index = np.arange(start=1, stop=len(data) + 1) S = pd.Series(data, index=index) - pd.testing.assert_series_equal(hpat_func(S).sort_index(), test_impl(S).sort_index()) + result = sdc_func(S) + result_ref = test_impl(S) + pd.testing.assert_series_equal(result.sort_index(), result_ref.sort_index()) def test_series_value_counts_no_unboxing(self): def test_impl(): @@ -4037,17 +4033,17 @@ def test_impl(series, ascending, kind): for data in all_data: series = pd.Series(data * 3) - for ascending in [True, False]: - for kind in ['quicksort', 'mergesort']: - ref_result = test_impl(series, ascending, kind=kind) - jit_result = hpat_func(series, 
ascending, kind=kind) - ref = restore_series_sort_values(series, ref_result.index, ascending) - jit = restore_series_sort_values(series, jit_result.index, ascending) + for ascending, kind in product([True, False], ['quicksort', 'mergesort']): + with self.subTest(data=data, ascending=ascending, kind=kind): + result = hpat_func(series, ascending, kind=kind) + result_ref = test_impl(series, ascending, kind=kind) if kind == 'mergesort': - pd.testing.assert_series_equal(ref_result, jit_result) + pd.testing.assert_series_equal(result, result_ref) else: - np.testing.assert_array_equal(ref_result.values, jit_result.values) - self.assertEqual(ref, jit) + np.testing.assert_array_equal(result.values, result_ref.values) + jit = restore_series_sort_values(series, result.index, ascending) + ref = restore_series_sort_values(series, result_ref.index, ascending) + self.assertEqual(jit, ref) @skip_parallel def test_series_sort_values_full_idx(self): @@ -5330,6 +5326,7 @@ def test_impl(A, i, value): test_impl(S2, idx, value) pd.testing.assert_series_equal(S1, S2) + @unittest.expectedFailure # FIXME_Pandas#37427 (since pandas=1.1 setitem does diff things for diff dtypes) def test_series_setitem_idx_str_series(self): """ Verifies Series.setitem for idx operand of type pandas.Series and string dtype called on integer Series with index of matching dtype and scalar and non scalar assigned values """ @@ -5345,6 +5342,7 @@ def test_series_setitem_idx_str_series(self): pd.Series(assigned_values)] self._test_series_setitem([series_data], [series_index], [idx], values_to_test, np.intp) + @unittest.expectedFailure # FIXME_Pandas#37427 (since pandas=1.1 setitem does diff things for diff dtypes) def test_series_setitem_idx_float_series(self): """ Verifies Series.setitem for idx operand of type pandas.Series and float dtype called on integer Series with index of matching dtype and scalar and non scalar assigned values """ diff --git a/sdc/utilities/sdc_typing_utils.py 
b/sdc/utilities/sdc_typing_utils.py index 81bc81c31..1d489d17f 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -40,6 +40,17 @@ from sdc.str_arr_type import string_array_type from sdc.datatypes.range_index_type import RangeIndexType +from sdc.datatypes.int64_index_type import Int64IndexType +from sdc.str_arr_ext import StringArrayType + + +sdc_pandas_index_types = ( + types.NoneType, + types.Array, + StringArrayType, + RangeIndexType, + Int64IndexType, + ) class TypeChecker: @@ -138,7 +149,7 @@ def check_is_numeric_array(type_var): def check_index_is_numeric(ty_series): """Used during typing to check that series has numeric index""" - return check_is_numeric_array(ty_series.index) + return isinstance(ty_series.index.dtype, types.Number) def check_types_comparable(ty_left, ty_right): @@ -196,6 +207,7 @@ def find_index_common_dtype(self, other): return index_dtypes_match, numba_index_common_dtype + def gen_impl_generator(codegen, impl_name): """Generate generator of an implementation""" def _df_impl_generator(*args, **kwargs): @@ -208,3 +220,7 @@ def _df_impl_generator(*args, **kwargs): return _impl return _df_impl_generator + + +def check_signed_integer(ty): + return isinstance(ty, types.Integer) and ty.signed diff --git a/setup.py b/setup.py index d4c8f1fab..903725410 100644 --- a/setup.py +++ b/setup.py @@ -375,9 +375,9 @@ def run(self): package_data={'sdc.tests': ['*.bz2'], }, install_requires=[ 'numpy>=1.16', - 'pandas>=1.0', + 'pandas>=1.2.0', 'pyarrow==0.17.0', - 'numba>=0.51.2,<0.52', + 'numba>=0.52.0,<0.53', 'tbb' ], cmdclass=sdc_build_commands, From df6175fcf23d74fc015165c35e54f73146f7f96f Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Fri, 19 Feb 2021 17:35:58 +0300 Subject: [PATCH 3/4] Revert "Merge from master 2021_w8 (#962)" (#963) This reverts commit 5ce38417dc3c7ba53a65b093d96888c1ec7008ca. 
--- README.rst | 8 +- buildscripts/utilities.py | 2 +- conda-recipe/meta.yaml | 4 +- .../_api_ref.pandas.window_templ.rst | 10 +- docs/source/getting_started.rst | 4 +- requirements.txt | 4 +- sdc/__init__.py | 3 +- sdc/_str_ext.cpp | 34 - sdc/datatypes/common_functions.py | 101 ++- .../hpat_pandas_dataframe_functions.py | 3 +- sdc/datatypes/hpat_pandas_series_functions.py | 31 +- sdc/datatypes/int64_index_type.py | 65 -- sdc/extensions/indexes/indexes_generic.py | 40 -- sdc/extensions/indexes/int64_index_ext.py | 415 ------------- sdc/extensions/indexes/range_index_ext.py | 59 +- sdc/functions/numpy_like.py | 56 +- sdc/functions/sort.py | 20 +- sdc/hiframes/api.py | 11 +- sdc/hiframes/boxing.py | 23 +- sdc/hiframes/pd_series_ext.py | 1 + sdc/native/module.cpp | 44 +- sdc/native/sort.cpp | 12 +- sdc/native/stable_sort.cpp | 14 +- sdc/native/utils.cpp | 12 - sdc/native/utils.hpp | 21 - sdc/set_ext.py | 1 - sdc/str_arr_ext.py | 83 +-- sdc/tests/__init__.py | 2 +- sdc/tests/indexes/__init__.py | 29 - sdc/tests/indexes/index_datagens.py | 88 --- sdc/tests/indexes/test_indexes.py | 266 -------- sdc/tests/indexes/test_int64_index.py | 583 ------------------ sdc/tests/test_dataframe.py | 24 - sdc/tests/test_date.py | 4 +- sdc/tests/test_hpat_jit.py | 1 - .../test_range_index.py => test_indexes.py} | 258 ++++++-- sdc/tests/test_rolling.py | 1 + sdc/tests/test_sdc_numpy.py | 39 -- sdc/tests/test_series.py | 72 +-- sdc/utilities/sdc_typing_utils.py | 18 +- setup.py | 4 +- 41 files changed, 449 insertions(+), 2021 deletions(-) delete mode 100644 sdc/datatypes/int64_index_type.py delete mode 100644 sdc/extensions/indexes/indexes_generic.py delete mode 100644 sdc/extensions/indexes/int64_index_ext.py delete mode 100644 sdc/tests/indexes/__init__.py delete mode 100644 sdc/tests/indexes/index_datagens.py delete mode 100644 sdc/tests/indexes/test_indexes.py delete mode 100644 sdc/tests/indexes/test_int64_index.py rename sdc/tests/{indexes/test_range_index.py => test_indexes.py} 
(74%) diff --git a/README.rst b/README.rst index a5f18b8a9..de0dd0e90 100644 --- a/README.rst +++ b/README.rst @@ -34,13 +34,13 @@ Distribution includes Intel® SDC for Python 3.6 and Python 3.7 for Windows and Intel® SDC conda package can be installed using the steps below:: - > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.0.5 -c anaconda -c conda-forge > conda activate sdc-env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel® SDC wheel package can be installed using the steps below:: - > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.0.5 -c anaconda -c conda-forge > conda activate sdc-env > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc @@ -82,7 +82,7 @@ Building on Linux with setuptools export PYVER=<3.6 or 3.7> export NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64 + conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.49 pandas=1.0.5 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64 source activate sdc-env git clone https://github.com/IntelPython/sdc.git cd sdc @@ -120,7 +120,7 @@ Building on Windows with setuptools set PYVER=<3.6 or 3.7> set NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 + conda create -n sdc-env -c intel/label/beta 
-c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.49 pandas=1.0.5 pyarrow=0.17.0 conda activate sdc-env set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include set LIB=%LIB%;%CONDA_PREFIX%\Library\lib diff --git a/buildscripts/utilities.py b/buildscripts/utilities.py index 440c64ec0..a1e1c0a90 100644 --- a/buildscripts/utilities.py +++ b/buildscripts/utilities.py @@ -52,7 +52,7 @@ def __init__(self, python, sdc_local_channel=None): self.line_single = '-'*80 # Set channels - self.channel_list = ['-c', 'defaults', '-c', 'conda-forge'] + self.channel_list = ['-c', 'intel/label/beta', '-c', 'defaults', '-c', 'conda-forge'] if sdc_local_channel: sdc_local_channel = Path(sdc_local_channel).resolve().as_uri() self.channel_list = ['-c', sdc_local_channel] + self.channel_list diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index bd95dbc9d..ed1084d14 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ -{% set NUMBA_VERSION = "==0.52.0" %} -{% set PANDAS_VERSION = "==1.2.0" %} +{% set NUMBA_VERSION = "==0.51.2" %} +{% set PANDAS_VERSION = "==1.0.5" %} {% set PYARROW_VERSION = "==0.17.0" %} package: diff --git a/docs/source/_templates/_api_ref.pandas.window_templ.rst b/docs/source/_templates/_api_ref.pandas.window_templ.rst index c4308cb9d..fbf6419cd 100644 --- a/docs/source/_templates/_api_ref.pandas.window_templ.rst +++ b/docs/source/_templates/_api_ref.pandas.window_templ.rst @@ -51,8 +51,8 @@ Exponentially-weighted moving window functions ---------------------------------------------- .. 
sdc_toctree - ewm.ExponentialMovingWindow.mean - ewm.ExponentialMovingWindow.std - ewm.ExponentialMovingWindow.var - ewm.ExponentialMovingWindow.corr - ewm.ExponentialMovingWindow.cov + EWM.mean + EWM.std + EWM.var + EWM.corr + EWM.cov diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index b0fcc0182..a8def2dd0 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla Intel SDC conda package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=0.25.3 -c anaconda -c conda-forge > conda activate sdc_env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel SDC wheel package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=0.25.3 -c anaconda -c conda-forge > conda activate sdc_env > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc diff --git a/requirements.txt b/requirements.txt index f3016c49e..dbe156342 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy>=1.16 -pandas==1.2.0 +pandas==0.25.3 pyarrow==0.17.0 -numba==0.52.0 +numba==0.51.2 tbb tbb-devel diff --git a/sdc/__init__.py b/sdc/__init__.py index e9ca063dd..2a514b70a 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -28,7 +28,7 @@ # re-export from Numba from numba import (typeof, prange, pndindex, gdb, gdb_breakpoint, gdb_init, - stencil, threading_layer, objmode) + stencil, threading_layer, jitclass, objmode) import sdc.config 
import sdc.set_ext @@ -48,7 +48,6 @@ import sdc.datatypes.series.init import sdc.extensions.indexes.range_index_ext -import sdc.extensions.indexes.int64_index_ext from ._version import get_versions diff --git a/sdc/_str_ext.cpp b/sdc/_str_ext.cpp index b5e41ce17..304d449fb 100644 --- a/sdc/_str_ext.cpp +++ b/sdc/_str_ext.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include "_str_decode.cpp" @@ -130,7 +129,6 @@ extern "C" npy_intp array_size(PyArrayObject* arr); void* array_getptr1(PyArrayObject* arr, npy_intp ind); void array_setitem(PyArrayObject* arr, char* p, PyObject* s); - void stable_argsort(char* data_ptr, uint32_t* in_offsets, int64_t len, int8_t ascending, uint64_t* result); PyMODINIT_FUNC PyInit_hstr_ext(void) { @@ -203,7 +201,6 @@ extern "C" PyObject_SetAttrString(m, "array_setitem", PyLong_FromVoidPtr((void*)(&array_setitem))); PyObject_SetAttrString(m, "decode_utf8", PyLong_FromVoidPtr((void*)(&decode_utf8))); PyObject_SetAttrString(m, "get_utf8_size", PyLong_FromVoidPtr((void*)(&get_utf8_size))); - PyObject_SetAttrString(m, "stable_argsort", PyLong_FromVoidPtr((void*)(&stable_argsort))); return m; } @@ -874,35 +871,4 @@ extern "C" return; } - void stable_argsort(char* data_ptr, uint32_t* in_offsets, int64_t len, int8_t ascending, uint64_t* result) - { - using str_index_pair_type = std::pair; - std::vector str_arr_indexed; - str_arr_indexed.reserve(len); - - for (int64_t i=0; i < len; ++i) - { - uint32_t start = in_offsets[i]; - uint32_t size = in_offsets[i + 1] - in_offsets[i]; - str_arr_indexed.emplace_back( - std::move(std::string(&data_ptr[start], size)), - i - ); - } - - std::stable_sort(str_arr_indexed.begin(), - str_arr_indexed.end(), - [=](const str_index_pair_type& left, const str_index_pair_type& right){ - if (ascending) - return left.first < right.first; - else - return left.first > right.first; - } - ); - - for (int64_t i=0; i < len; ++i) - result[i] = str_arr_indexed[i].second; - } - - } // extern "C" diff --git 
a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index 8924a99be..bffdc5b30 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -48,17 +48,14 @@ from sdc.functions import numpy_like from sdc.str_arr_type import string_array_type, StringArrayType from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.str_arr_ext import (num_total_chars, append_string_array_to, str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type, cp_str_list_to_array, create_str_arr_from_list, get_utf8_size, - str_arr_set_na_by_mask, str_arr_stable_argosort) + str_arr_set_na_by_mask) from sdc.utilities.prange_utils import parallel_chunks from sdc.utilities.utils import sdc_overload, sdc_register_jitable -from sdc.utilities.sdc_typing_utils import ( - find_common_dtype_from_numpy_dtypes, - TypeChecker) -from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types +from sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes, + TypeChecker) class SDCLimitation(Exception): @@ -74,20 +71,18 @@ def hpat_arrays_append(A, B): def hpat_arrays_append_overload(A, B): """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A""" - use_A_array = isinstance(A, (RangeIndexType, Int64IndexType)) - use_B_array = isinstance(B, (RangeIndexType, Int64IndexType)) - if isinstance(A, (types.Array, RangeIndexType, Int64IndexType)): - if isinstance(B, (types.Array, RangeIndexType, Int64IndexType)): + A_is_range_index = isinstance(A, RangeIndexType) + B_is_range_index = isinstance(B, RangeIndexType) + if isinstance(A, (types.Array, RangeIndexType)): + if isinstance(B, (types.Array, RangeIndexType)): def _append_single_numeric_impl(A, B): - _A = A.values if use_A_array == True else A # noqa - _B = B.values if use_B_array == True else B # noqa + _A = A.values if A_is_range_index == True else A # noqa + _B = B.values if 
B_is_range_index == True else B # noqa return numpy.concatenate((_A, _B,)) return _append_single_numeric_impl - - elif (isinstance(B, (types.UniTuple, types.List)) - and isinstance(B.dtype, (types.Array, RangeIndexType, Int64IndexType))): - B_dtype_is_index = isinstance(B.dtype, (RangeIndexType, Int64IndexType)) + elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, (types.Array, RangeIndexType)): + B_dtype_is_range_index = isinstance(B.dtype, RangeIndexType) numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], []) # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime @@ -97,10 +92,10 @@ def _append_list_numeric_impl(A, B): new_data = numpy.empty(total_length, numba_common_dtype) stop = len(A) - _A = numpy.array(A) if use_A_array == True else A # noqa + _A = numpy.array(A) if A_is_range_index == True else A # noqa new_data[:stop] = _A for arr in B: - _arr = arr.values if B_dtype_is_index == True else arr # noqa + _arr = numpy.array(arr) if B_dtype_is_range_index == True else arr # noqa start = stop stop = start + len(_arr) new_data[start:stop] = _arr @@ -223,13 +218,12 @@ def sdc_join_series_indexes_overload(left, right): """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm""" # check that both operands are of types used for representing Pandas indexes - if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types) - and not isinstance(left, types.NoneType) - and not isinstance(right, types.NoneType)): + if not (isinstance(left, (types.Array, StringArrayType, RangeIndexType)) + and isinstance(right, (types.Array, StringArrayType, RangeIndexType))): return None - convert_left = isinstance(left, (RangeIndexType, Int64IndexType)) - convert_right = isinstance(right, (RangeIndexType, Int64IndexType)) + convert_left = isinstance(left, RangeIndexType) + convert_right = isinstance(right, RangeIndexType) def 
_convert_to_arrays_impl(left, right): _left = left.values if convert_left == True else left # noqa @@ -249,9 +243,10 @@ def sdc_join_range_indexes_impl(left, right): return sdc_join_range_indexes_impl - elif (isinstance(left, (RangeIndexType, Int64IndexType, types.Array)) - and isinstance(right, (RangeIndexType, Int64IndexType, types.Array)) - and not (isinstance(left, types.Array) and isinstance(right, types.Array))): + elif isinstance(left, RangeIndexType) and isinstance(right, types.Array): + return _convert_to_arrays_impl + + elif isinstance(left, types.Array) and isinstance(right, RangeIndexType): return _convert_to_arrays_impl # TODO: remove code duplication below and merge numeric and StringArray impls into one @@ -518,7 +513,7 @@ def sdc_arrays_argsort(A, kind='quicksort'): @sdc_overload(sdc_arrays_argsort, jit_options={'parallel': False}) -def sdc_arrays_argsort_overload(A, kind='quicksort', ascending=True): +def sdc_arrays_argsort_overload(A, kind='quicksort'): """Function providing pandas argsort implementation for different 1D array types""" # kind is not known at compile time, so get this function here and use in impl if needed @@ -526,31 +521,33 @@ def sdc_arrays_argsort_overload(A, kind='quicksort', ascending=True): kind_is_default = isinstance(kind, str) if isinstance(A, types.Array): - def _sdc_arrays_argsort_array_impl(A, kind='quicksort', ascending=True): + def _sdc_arrays_argsort_array_impl(A, kind='quicksort'): _kind = 'quicksort' if kind_is_default == True else kind # noqa - return numpy_like.argsort(A, kind=_kind, ascending=ascending) + return numpy_like.argsort(A, kind=_kind) return _sdc_arrays_argsort_array_impl elif A == string_array_type: - def _sdc_arrays_argsort_str_arr_impl(A, kind='quicksort', ascending=True): + def _sdc_arrays_argsort_str_arr_impl(A, kind='quicksort'): + nan_mask = sdc.hiframes.api.get_nan_mask(A) + idx = numpy.arange(len(A)) + old_nan_positions = idx[nan_mask] + + data = A[~nan_mask] + keys = idx[~nan_mask] if kind 
== 'quicksort': - indexes = numpy.arange(len(A)) - data_index_pairs = list(zip(list(A), list(indexes))) - zipped = quicksort_func(data_index_pairs) - argsorted = [zipped[i][1] for i in indexes] - res = numpy.array(argsorted, dtype=numpy.int64) - # for non-stable sort the order within groups does not matter - # so just reverse the result when sorting in descending order - if not ascending: - res = res[::-1] + zipped = list(zip(list(data), list(keys))) + zipped = quicksort_func(zipped) + argsorted = [zipped[i][1] for i in numpy.arange(len(data))] elif kind == 'mergesort': - res = str_arr_stable_argosort(A, ascending=ascending) + sdc.hiframes.sort.local_sort((data, ), (keys, )) + argsorted = list(keys) else: raise ValueError("Unrecognized kind of sort in sdc_arrays_argsort") - return res + argsorted.extend(old_nan_positions) + return numpy.asarray(argsorted, dtype=numpy.int32) return _sdc_arrays_argsort_str_arr_impl @@ -621,16 +618,13 @@ def _sdc_take(data, indexes): @sdc_overload(_sdc_take) def _sdc_take_overload(data, indexes): - valid_data_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(data, valid_data_types) and not isinstance(data, types.NoneType)): + if not isinstance(data, (types.Array, StringArrayType, RangeIndexType)): return None - - if not (isinstance(indexes, (types.Array, types.List, Int64IndexType)) + if not (isinstance(indexes, (types.Array, types.List)) and isinstance(indexes.dtype, (types.Integer, types.ListType))): return None - if (isinstance(indexes.dtype, types.ListType) - and isinstance(data, (types.Array, types.List, RangeIndexType, Int64IndexType))): + if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List, RangeIndexType)): arr_dtype = data.dtype def _sdc_take_list_impl(data, indexes): @@ -683,7 +677,7 @@ def _sdc_take_list_str_impl(data, indexes): return _sdc_take_list_str_impl - elif isinstance(data, (types.Array, RangeIndexType, Int64IndexType)): + elif isinstance(data, (types.Array, 
RangeIndexType)): arr_dtype = data.dtype def _sdc_take_array_impl(data, indexes): @@ -746,7 +740,6 @@ def sdc_reindex_series_overload(arr, index, name, by_index): """ Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """ range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType) - int64_indexes = isinstance(index, Int64IndexType) and isinstance(by_index, Int64IndexType) data_dtype, index_dtype = arr.dtype, index.dtype data_is_str_arr = isinstance(arr.dtype, types.UnicodeType) @@ -755,8 +748,6 @@ def sdc_reindex_series_impl(arr, index, name, by_index): # no reindexing is needed if indexes are equal if range_indexes == True: # noqa equal_indexes = numpy_like.array_equal(index, by_index) - elif int64_indexes == True: # noqa - equal_indexes = numpy_like.array_equal(index, by_index) else: equal_indexes = False if (index is by_index or equal_indexes): @@ -781,10 +772,10 @@ def sdc_reindex_series_impl(arr, index, name, by_index): map_index_to_position[value] = i index_mismatch = 0 - for i in numba.prange(len(by_index)): - val = by_index[i] - if val in map_index_to_position: - pos_in_self = map_index_to_position[val] + # FIXME: TypingError in parfor step (wrong promotion to float64?) 
if prange is used + for i in numpy.arange(len(by_index)): + if by_index[i] in map_index_to_position: + pos_in_self = map_index_to_position[by_index[i]] _res_data[i] = arr[pos_in_self] if data_is_str_arr == True: # noqa res_data_nan_mask[i] = isna(arr, i) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index de7edef66..31f3738d9 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -50,7 +50,6 @@ gen_impl_generator, find_common_dtype_from_numpy_dtypes) from sdc.str_arr_ext import StringArrayType from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.hiframes.pd_dataframe_type import DataFrameType from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps @@ -2258,7 +2257,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx): if accessor == 'at': num_idx = (isinstance(idx[0], types.Number) - and isinstance(self.dataframe.index, (types.NoneType, RangeIndexType, Int64IndexType))) + and isinstance(self.dataframe.index, (types.Array, types.NoneType, RangeIndexType))) str_idx = (isinstance(idx[0], (types.UnicodeType, types.StringLiteral)) and isinstance(self.dataframe.index, StringArrayType)) if isinstance(idx, types.Tuple) and isinstance(idx[1], types.StringLiteral): diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 1c18ba2e6..610a21fc7 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -53,7 +53,6 @@ find_common_dtype_from_numpy_dtypes, has_literal_value, has_python_value) from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.datatypes.common_functions import (sdc_join_series_indexes, sdc_arrays_argsort, sdc_reindex_series) from 
sdc.datatypes.hpat_pandas_rolling_types import ( gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl) @@ -72,7 +71,6 @@ from sdc.hiframes.api import isna from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby from sdc.utilities.prange_utils import parallel_chunks -from sdc.set_ext import build_set from .pandas_series_functions import apply from .pandas_series_functions import map as _map @@ -620,7 +618,7 @@ def sdc_pandas_series_setitem(self, idx, value): def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): if assign_via_idx_mask == True: # noqa - # FIXME_Numba#5157: using asarray since eq impl for index types returns list + # FIXME_Numba#5157: using asarray since eq impl for RangeIndexType returns list _idx = numpy.asarray(self._index == idx) elif assign_via_idx_data == True: # noqa _idx = idx._data @@ -653,7 +651,7 @@ def sdc_pandas_series_setitem_idx_bool_array_align_impl(self, idx, value): # and filtered indexes are looked in value.index, and if found corresponding value is set if value_is_series == True: # noqa value_index, self_index = value.index, self.index - unique_value_indices, unique_self_indices = build_set(value_index), build_set(self_index) + unique_value_indices, unique_self_indices = set(value_index), set(self_index) # pandas behaves differently if value.index has duplicates and if it has no # in case of duplicates in value.index assignment is made via positions @@ -703,7 +701,7 @@ def sdc_pandas_series_setitem_idx_bool_series_align_impl(self, idx, value): # and filtered indexes are either looked in value.index (if value is a Series) # or in self.index (if value is scalar or array) filtered_idx_indices = idx_index[idx._data] - filtered_idx_indices_set = build_set(filtered_idx_indices) + filtered_idx_indices_set = set(filtered_idx_indices) if value_is_series == True: # noqa if len(filtered_idx_indices_set) != len(filtered_idx_indices): @@ -776,7 +774,7 @@ def 
sdc_pandas_series_setitem_idx_int_series_align_impl(self, idx, value): raise ValueError("Reindexing only valid with uniquely valued Index objects") if len(valid_indices_masked) != idx_size: - raise KeyError("Reindexing not possible: idx has index not found in Series") + raise ValueError("Reindexing not possible: idx has index not found in Series") if value_is_scalar == True: # noqa self._data[valid_indices_positions] = _value @@ -810,7 +808,7 @@ def sdc_pandas_series_setitem_idx_str_series_align_impl(self, idx, value): set_positions[i] = map_index_to_position[index_value] if number_of_found != idx_data_size: - raise KeyError("Reindexing not possible: idx has index not found in Series") + raise ValueError("Reindexing not possible: idx has index not found in Series") if value_is_series == True: # noqa self._data[set_positions] = value._data @@ -2075,7 +2073,7 @@ def hpat_pandas_series_isin_impl(self, values): # return pandas.Series (np.isin (self._data, values)) values = str_list_to_array(list(values)) - values = build_set(values) + values = set(values) data_len = len(self._data) result = numpy.empty(data_len, dtype=numpy.bool_) for i in prange(data_len): @@ -2087,7 +2085,7 @@ def hpat_pandas_series_isin_impl(self, values): # TODO: replace with below line when Numba supports np.isin in nopython mode # return pandas.Series (np.isin (self._data, values)) - values = build_set(values) + values = set(values) data_len = len(self._data) result = numpy.empty(data_len, dtype=numpy.bool_) for i in prange(data_len): @@ -3448,7 +3446,7 @@ def hpat_pandas_series_unique_str_impl(self): Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_unique_str ''' - str_set = build_set(self._data) + str_set = set(self._data) return to_array(str_set) return hpat_pandas_series_unique_str_impl @@ -3580,7 +3578,7 @@ def hpat_pandas_series_nunique_str_impl(self, dropna=True): if dropna: nan_mask = self.isna() data = self._data[~nan_mask._data] - unique_values = build_set(data) + 
unique_values = set(data) return len(unique_values) return hpat_pandas_series_nunique_str_impl @@ -3593,7 +3591,7 @@ def hpat_pandas_series_nunique_impl(self, dropna=True): data_mask_for_nan = numpy.isnan(self._data) nan_exists = numpy.any(data_mask_for_nan) data_no_nan = self._data[~data_mask_for_nan] - data_set = build_set(data_no_nan) + data_set = set(data_no_nan) if dropna or not nan_exists: return len(data_set) else: @@ -3950,9 +3948,11 @@ def _sdc_pandas_series_sort_values_impl( good = ~data_nan_mask if kind_is_none_or_default == True: # noqa - argsort_res = sdc_arrays_argsort(self._data[good], kind='quicksort', ascending=ascending) + argsort_res = sdc_arrays_argsort(self._data[good], kind='quicksort') else: - argsort_res = sdc_arrays_argsort(self._data[good], kind=kind, ascending=ascending) + argsort_res = sdc_arrays_argsort(self._data[good], kind=kind) + if not ascending: + argsort_res = argsort_res[::-1] idx = numpy.arange(len(self), dtype=numpy.int32) sorted_index = numpy.empty(len(self), dtype=numpy.int32) @@ -4034,8 +4034,7 @@ def hpat_pandas_series_dropna(self, axis=0, inplace=False): ty_checker.raise_exc(inplace, 'bool', 'inplace') if (isinstance(self.data.dtype, types.Number) - and (isinstance(self.index, types.NoneType) - or isinstance(self.index.dtype, types.Number))): + and isinstance(self.index, (types.Number, types.NoneType, RangeIndexType))): def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False): index = self.index return numpy_like.dropna(self._data, index, self._name) diff --git a/sdc/datatypes/int64_index_type.py b/sdc/datatypes/int64_index_type.py deleted file mode 100644 index 745d394a7..000000000 --- a/sdc/datatypes/int64_index_type.py +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2020, Intel Corporation All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ***************************************************************************** - -from numba import types -from numba.extending import ( - models, - register_model, - make_attribute_wrapper -) - - -class Int64IndexType(types.IterableType): - dtype = types.int64 - - def __init__(self, data, is_named=False): - self.data = data - self.is_named = is_named - super(Int64IndexType, self).__init__( - name='Int64IndexType({}, {})'.format(data, is_named)) - - @property - def iterator_type(self): - res = self.data.iterator_type - return res - - -@register_model(Int64IndexType) -class Int64IndexModel(models.StructModel): - def __init__(self, dmm, fe_type): - - data_type = fe_type.data - name_type = types.unicode_type if fe_type.is_named else types.none - members = [ - ('data', data_type), - ('name', name_type), - ] - models.StructModel.__init__(self, dmm, fe_type, members) - - -make_attribute_wrapper(Int64IndexType, 'data', '_data') -make_attribute_wrapper(Int64IndexType, 'name', '_name') diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py deleted file mode 100644 index 397698565..000000000 --- a/sdc/extensions/indexes/indexes_generic.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2019-2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ***************************************************************************** - -import numba -import numpy as np -import pandas as pd - -from numba import types - - -def _check_dtype_param_type(dtype): - """ Returns True is dtype is a valid type for dtype parameter and False otherwise. - Used in RangeIndex ctor and other methods that take dtype parameter. """ - - valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) - return isinstance(dtype, valid_dtype_types) or dtype is None diff --git a/sdc/extensions/indexes/int64_index_ext.py b/sdc/extensions/indexes/int64_index_ext.py deleted file mode 100644 index 97db3fd4b..000000000 --- a/sdc/extensions/indexes/int64_index_ext.py +++ /dev/null @@ -1,415 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2019-2020, Intel Corporation All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ***************************************************************************** - -import numba -import numpy as np -import operator -import pandas as pd - -from numba import types, prange -from numba.core import cgutils -from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, ) -from numba.core.errors import TypingError -from numba.core.typing.templates import signature -from numba.core.imputils import impl_ret_untracked, call_getiter - -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType -from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method -from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array, check_signed_integer -from sdc.functions import numpy_like -from numba.core.boxing import box_array, unbox_array -from sdc.hiframes.api import fix_df_index -from sdc.extensions.indexes.indexes_generic import _check_dtype_param_type - - -@intrinsic -def init_int64_index(typingctx, data, name=None): - - if not (isinstance(data, types.Array) and data.dtype is types.int64): - return None - assert data.ndim == 1, "Index data must be 1-dimensional" - - name = types.none if name is None else name - is_named = False if name is types.none else True - - def codegen(context, builder, sig, args): - data_val, name_val = args - # create series struct and store values - int64_index = cgutils.create_struct_proxy( - sig.return_type)(context, builder) - - int64_index.data = data_val - - if is_named: - if isinstance(name, types.StringLiteral): - int64_index.name = numba.cpython.unicode.make_string_from_constant( - context, builder, types.unicode_type, name.literal_value) - else: - int64_index.name = name_val - - if context.enable_nrt: - context.nrt.incref(builder, sig.args[0], data_val) - if is_named: - context.nrt.incref(builder, sig.args[1], name_val) - - return int64_index._getvalue() - - ret_typ = Int64IndexType(data, 
is_named) - sig = signature(ret_typ, data, name) - return sig, codegen - - -@sdc_overload(pd.Int64Index) -def pd_int64_index_overload(data, dtype=None, copy=False, name=None): - - _func_name = 'pd.Int64Index().' - ty_checker = TypeChecker(_func_name) - - if not (isinstance(data, (types.Array, types.List)) and isinstance(data.dtype, types.Integer) - or isinstance(data, (RangeIndexType, Int64IndexType))): - ty_checker.raise_exc(data, 'array/list of integers or integer index', 'data') - - dtype_is_number_class = isinstance(dtype, types.NumberClass) - dtype_is_numpy_signed_int = (check_signed_integer(dtype) - or dtype_is_number_class and check_signed_integer(dtype.dtype)) - dtype_is_unicode_str = isinstance(dtype, (types.UnicodeType, types.StringLiteral)) - if not _check_dtype_param_type(dtype): - ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') - - if not (isinstance(copy, (types.NoneType, types.Omitted, types.Boolean)) or copy is False): - ty_checker.raise_exc(copy, 'bool', 'copy') - - if not (isinstance(name, (types.NoneType, types.Omitted, types.StringLiteral, types.UnicodeType)) or name is None): - ty_checker.raise_exc(name, 'string or none', 'name') - - is_data_array = isinstance(data, types.Array) - is_data_index = isinstance(data, (RangeIndexType, Int64IndexType)) - data_dtype_is_int64 = data.dtype is types.int64 - - def pd_int64_index_ctor_impl(data, dtype=None, copy=False, name=None): - - if not (dtype is None - or dtype_is_numpy_signed_int - or dtype_is_unicode_str and dtype in ('int8', 'int16', 'int32', 'int64')): - raise ValueError("Incorrect `dtype` passed: expected signed integer") - - if is_data_array == True: # noqa - _data = data - elif is_data_index == True: # noqa - _data = data.values - else: - _data = fix_df_index(data)._data - - if data_dtype_is_int64 == False: # noqa - _data = numpy_like.astype(_data, dtype=types.int64) - else: - if copy: - _data = np.copy(_data) - return init_int64_index(_data, name) - - return pd_int64_index_ctor_impl - - 
-@typeof_impl.register(pd.Int64Index) -def typeof_int64_index(val, c): - index_data_ty = numba.typeof(val._data) - is_named = val.name is not None - return Int64IndexType(index_data_ty, is_named=is_named) - - -@box(Int64IndexType) -def box_int64_index(typ, val, c): - - mod_name = c.context.insert_const_string(c.builder.module, "pandas") - pd_class_obj = c.pyapi.import_module_noblock(mod_name) - - int64_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val) - data = box_array(typ.data, int64_index.data, c) - - # dtype and copy params are not stored so use default values - dtype = c.pyapi.make_none() - copy = c.pyapi.bool_from_bool( - c.context.get_constant(types.bool_, False) - ) - - if typ.is_named: - name = c.pyapi.from_native_value(types.unicode_type, int64_index.name) - else: - name = c.pyapi.make_none() - - res = c.pyapi.call_method(pd_class_obj, "Int64Index", (data, dtype, copy, name)) - - c.pyapi.decref(data) - c.pyapi.decref(dtype) - c.pyapi.decref(copy) - c.pyapi.decref(name) - c.pyapi.decref(pd_class_obj) - return res - - -@unbox(Int64IndexType) -def unbox_int64_index(typ, val, c): - - # TODO: support index unboxing with reference to parent in Numba? 
- int64_index = cgutils.create_struct_proxy(typ)(c.context, c.builder) - index_data = c.pyapi.object_getattr_string(val, "_data") - int64_index.data = unbox_array(typ.data, index_data, c).value - c.pyapi.decref(index_data) - - if typ.is_named: - name_obj = c.pyapi.object_getattr_string(val, "name") - int64_index.name = numba.cpython.unicode.unbox_unicode_str( - types.unicode_type, name_obj, c).value - c.pyapi.decref(name_obj) - - is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) - return NativeValue(int64_index._getvalue(), is_error=is_error) - - -@sdc_overload_attribute(Int64IndexType, 'name') -def pd_int64_index_name_overload(self): - if not isinstance(self, Int64IndexType): - return None - - is_named_index = self.is_named - - def pd_int64_index_name_impl(self): - if is_named_index == True: # noqa - return self._name - else: - return None - - return pd_int64_index_name_impl - - -@sdc_overload_attribute(Int64IndexType, 'dtype') -def pd_int64_index_dtype_overload(self): - if not isinstance(self, Int64IndexType): - return None - - range_index_dtype = self.dtype - - def pd_int64_index_dtype_impl(self): - return range_index_dtype - - return pd_int64_index_dtype_impl - - -@sdc_overload_attribute(Int64IndexType, 'values') -def pd_int64_index_values_overload(self): - if not isinstance(self, Int64IndexType): - return None - - def pd_int64_index_values_impl(self): - return self._data - - return pd_int64_index_values_impl - - -@sdc_overload(len) -def pd_int64_index_len_overload(self): - if not isinstance(self, Int64IndexType): - return None - - def pd_int64_index_len_impl(self): - return len(self._data) - - return pd_int64_index_len_impl - - -@sdc_overload(operator.contains) -def pd_int64_index_contains_overload(self, val): - if not isinstance(self, Int64IndexType): - return None - - _func_name = 'Operator contains().' 
- ty_checker = TypeChecker(_func_name) - - if not (isinstance(val, types.Integer)): - ty_checker.raise_exc(val, 'integer scalar', 'val') - - def pd_int64_index_contains_impl(self, val): - # TO-DO: add operator.contains support for arrays in Numba - found = 0 - for i in prange(len(self._data)): - if val == self._data[i]: - found += 1 - - return found > 0 - - return pd_int64_index_contains_impl - - -@sdc_overload_method(Int64IndexType, 'copy') -def pd_int64_index_copy_overload(self, name=None, deep=False, dtype=None): - if not isinstance(self, Int64IndexType): - return None - - _func_name = 'Method copy().' - ty_checker = TypeChecker(_func_name) - - if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None): - ty_checker.raise_exc(name, 'string or none', 'name') - - if not (isinstance(deep, (types.Omitted, types.Boolean)) or deep is False): - ty_checker.raise_exc(deep, 'boolean', 'deep') - - if not _check_dtype_param_type(dtype): - ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') - - name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None - keep_name = name_is_none and self.is_named - - def pd_int64_index_copy_impl(self, name=None, deep=False, dtype=None): - - _name = self._name if keep_name == True else name # noqa - new_index_data = self._data if not deep else numpy_like.copy(self._data) - return init_int64_index(new_index_data, _name) - - return pd_int64_index_copy_impl - - -@sdc_overload(operator.getitem) -def pd_int64_index_getitem_overload(self, idx): - if not isinstance(self, Int64IndexType): - return None - - _func_name = 'Operator getitem().' 
- ty_checker = TypeChecker(_func_name) - - if not (isinstance(idx, (types.Integer, types.SliceType)) - or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))): - ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx') - - if isinstance(idx, types.Integer): - def pd_int64_index_getitem_impl(self, idx): - index_len = len(self._data) - # FIXME_Numba#5801: Numba type unification rules make this float - idx = types.int64((index_len + idx) if idx < 0 else idx) - if (idx < 0 or idx >= index_len): - raise IndexError("Int64Index.getitem: index is out of bounds") - - return self._data[idx] - - return pd_int64_index_getitem_impl - - else: - def pd_int64_index_getitem_impl(self, idx): - index_data = self._data[idx] - return pd.Int64Index(index_data, name=self._name) - - return pd_int64_index_getitem_impl - - -# TO-DO: this and many other impls are generic and should be moved to indexes_generic.py -@sdc_overload(operator.eq) -def pd_int64_index_eq_overload(self, other): - - self_is_index = isinstance(self, Int64IndexType) - other_is_index = isinstance(other, Int64IndexType) - - if not (self_is_index and other_is_index - or (self_is_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_index)): - return None - one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) - - def pd_int64_index_eq_impl(self, other): - - if one_operand_is_scalar == False: # noqa - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - # names do not matter when comparing pd.Int64Index - left = self.values if self_is_index == True else self # noqa - right = other.values if other_is_index == True else other # noqa - return list(left == right) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed - - return pd_int64_index_eq_impl - - 
-@sdc_overload(operator.ne) -def pd_int64_index_ne_overload(self, other): - - self_is_index = isinstance(self, Int64IndexType) - other_is_index = isinstance(other, Int64IndexType) - - if not (self_is_index and other_is_index - or (self_is_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_index)): - return None - - def pd_int64_index_ne_impl(self, other): - - eq_res = np.asarray(self == other) # FIXME_Numba#5157: remove np.asarray and return as list - return list(~eq_res) - - return pd_int64_index_ne_impl - - -@lower_builtin(operator.is_, Int64IndexType, Int64IndexType) -def pd_int64_index_is_overload(context, builder, sig, args): - - ty_lhs, ty_rhs = sig.args - if ty_lhs != ty_rhs: - return cgutils.false_bit - - lhs, rhs = args - lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t) - rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t) - return builder.icmp_signed('==', lhs_ptr, rhs_ptr) - - -@lower_builtin('getiter', Int64IndexType) -def pd_int64_index_getiter(context, builder, sig, args): - """ Returns a new iterator object for Int64IndexType by delegating to array __iter__ """ - (value,) = args - int64_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value) - res = call_getiter(context, builder, sig.args[0].data, int64_index.data) - return impl_ret_untracked(context, builder, Int64IndexType, res) - - -@sdc_overload_method(Int64IndexType, 'ravel') -def pd_int64_index_ravel_overload(self, order='C'): - if not isinstance(self, Int64IndexType): - return None - - _func_name = 'Method ravel().' - - # np.ravel argument order is not supported in Numba - if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): - raise TypingError('{} Unsupported parameters. 
Given order: {}'.format(_func_name, order)) - - def pd_int64_index_ravel_impl(self, order='C'): - # np.ravel argument order is not supported in Numba - if order != 'C': - raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") - - return self.values - - return pd_int64_index_ravel_impl diff --git a/sdc/extensions/indexes/range_index_ext.py b/sdc/extensions/indexes/range_index_ext.py index cc04cfcba..7b24e7528 100644 --- a/sdc/extensions/indexes/range_index_ext.py +++ b/sdc/extensions/indexes/range_index_ext.py @@ -33,18 +33,23 @@ from numba import types from numba.core import cgutils from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, ) -from numba.core.errors import TypingError + from numba.core.typing.templates import signature from numba.core.imputils import impl_ret_untracked, call_getiter from sdc.datatypes.range_index_type import RangeIndexType, RangeIndexDataType from sdc.datatypes.common_functions import SDCLimitation, _sdc_take from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method -from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array, check_signed_integer +from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array from sdc.functions.numpy_like import getitem_by_mask -from sdc.functions.numpy_like import astype as nplike_astype -from numba.core.boxing import box_array, unbox_array -from sdc.extensions.indexes.indexes_generic import _check_dtype_param_type + + +def _check_dtype_param_type(dtype): + """ Returns True is dtype is a valid type for dtype parameter and False otherwise. + Used in RangeIndex ctor and other methods that take dtype parameter. 
""" + + valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) + return isinstance(dtype, valid_dtype_types) or dtype is None @intrinsic @@ -91,9 +96,8 @@ def pd_range_index_overload(start=None, stop=None, step=None, dtype=None, copy=F if not (isinstance(copy, types.Omitted) or fastpath is None): raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'fastpath': {fastpath}") - dtype_is_number_class = isinstance(dtype, types.NumberClass) - dtype_is_numpy_signed_int = (check_signed_integer(dtype) - or dtype_is_number_class and check_signed_integer(dtype.dtype)) + dtype_is_np_int64 = dtype is types.NumberClass(types.int64) + dtype_is_np_int32 = dtype is types.NumberClass(types.int32) dtype_is_unicode_str = isinstance(dtype, (types.UnicodeType, types.StringLiteral)) if not _check_dtype_param_type(dtype): ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') @@ -121,8 +125,10 @@ def pd_range_index_ctor_dummy_impl( def pd_range_index_ctor_impl(start=None, stop=None, step=None, dtype=None, copy=False, name=None, fastpath=None): if not (dtype is None - or dtype_is_numpy_signed_int - or dtype_is_unicode_str and dtype in ('int8', 'int16', 'int32', 'int64')): + or dtype_is_unicode_str and dtype == 'int64' + or dtype_is_unicode_str and dtype == 'int32' + or dtype_is_np_int64 + or dtype_is_np_int32): raise ValueError("Incorrect `dtype` passed: expected signed integer") # TODO: add support of int32 type @@ -350,8 +356,7 @@ def pd_range_index_getitem_overload(self, idx): if isinstance(idx, types.Integer): def pd_range_index_getitem_impl(self, idx): range_len = len(self._data) - # FIXME_Numba#5801: Numba type unification rules make this float - idx = types.int64((range_len + idx) if idx < 0 else idx) + idx = (range_len + idx) if idx < 0 else idx if (idx < 0 or idx >= range_len): raise IndexError("RangeIndex.getitem: index is out of bounds") return self.start + self.step * idx @@ -370,12 +375,12 @@ def pd_range_index_getitem_impl(self, 
idx): return pd_range_index_getitem_impl + # returns np.array which is used to represent pandas Int64Index now if isinstance(idx, (types.Array, types.List)): if isinstance(idx.dtype, types.Integer): def pd_range_index_getitem_impl(self, idx): - res_as_arr = _sdc_take(self, idx) - return pd.Int64Index(res_as_arr, name=self._name) + return _sdc_take(self, idx) return pd_range_index_getitem_impl elif isinstance(idx.dtype, types.Boolean): @@ -393,7 +398,7 @@ def pd_range_index_eq_overload(self, other): if not (self_is_range_index and other_is_range_index or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_range_index)): + or ((check_is_numeric_array(self) or isinstance(self, types.Number) and other_is_range_index))): return None one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) @@ -419,7 +424,7 @@ def pd_range_index_ne_overload(self, other): if not (self_is_range_index and other_is_range_index or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_range_index)): + or ((check_is_numeric_array(self) or isinstance(self, types.Number) and other_is_range_index))): return None def pd_range_index_ne_impl(self, other): @@ -448,25 +453,5 @@ def pd_range_index_getiter(context, builder, sig, args): """ Returns a new iterator object for RangeIndexType by delegating to range.__iter__ """ (value,) = args range_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value) - res = call_getiter(context, builder, RangeIndexDataType, range_index.data) + res = call_getiter(context, builder, types.range_state64_type, range_index.data) return impl_ret_untracked(context, builder, RangeIndexType, res) - - -@sdc_overload_method(RangeIndexType, 'ravel') -def pd_range_index_ravel_overload(self, 
order='C'): - if not isinstance(self, RangeIndexType): - return None - - _func_name = 'Method ravel().' - - if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): - raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order)) - - def pd_range_index_ravel_impl(self, order='C'): - # np.ravel argument order is not supported in Numba - if order != 'C': - raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") - - return self.values - - return pd_range_index_ravel_impl diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index aa9e00a70..1e67e8ccd 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -47,7 +47,6 @@ from sdc.functions.statistics import skew_formula from sdc.hiframes.api import isna from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.utilities.sdc_typing_utils import TypeChecker, is_default from sdc.utilities.utils import (sdc_overload, sdc_register_jitable, min_dtype_int_val, max_dtype_int_val, min_dtype_float_val, @@ -58,8 +57,6 @@ from sdc.utilities.prange_utils import parallel_chunks from sdc.utilities.sdc_typing_utils import check_types_comparable from sdc.functions.sort import parallel_sort, parallel_stable_sort, parallel_argsort, parallel_stable_argsort -from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types - def astype(self, dtype): pass @@ -123,9 +120,7 @@ def sdc_astype_overload(self, dtype): """ ty_checker = TypeChecker("numpy-like 'astype'") - valid_self_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(self, valid_self_types) - and not isinstance(self, types.NoneType)): + if not isinstance(self, (types.Array, StringArrayType, RangeIndexType)): return None accepted_dtype_types = (types.functions.NumberClass, types.Function, types.StringLiteral) @@ -161,7 +156,7 @@ def 
sdc_astype_number_to_string_impl(self, dtype): return sdc_astype_number_to_string_impl - if (isinstance(self, (types.Array, RangeIndexType, Int64IndexType)) + if (isinstance(self, (types.Array, RangeIndexType)) and isinstance(dtype, (types.StringLiteral, types.functions.NumberClass))): def sdc_astype_number_impl(self, dtype): arr = numpy.empty(len(self), dtype=numpy.dtype(dtype)) @@ -349,9 +344,7 @@ def sdc_copy_overload(self): Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k copy """ - valid_self_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(self, valid_self_types) - and not isinstance(self, types.NoneType)): + if not isinstance(self, (types.Array, StringArrayType, RangeIndexType)): return None if isinstance(self, types.Array): @@ -367,7 +360,7 @@ def sdc_copy_array_impl(self): return sdc_copy_array_impl - if isinstance(self, (StringArrayType, RangeIndexType, Int64IndexType)): + if isinstance(self, (StringArrayType, RangeIndexType)): def sdc_copy_str_arr_impl(self): return self.copy() @@ -960,7 +953,7 @@ def getitem_by_mask(arr, idx): @sdc_overload(getitem_by_mask) -def getitem_by_mask_overload(self, idx): +def getitem_by_mask_overload(arr, idx): """ Creates a new array from arr by selecting elements indicated by Boolean mask idx. 
@@ -978,17 +971,13 @@ def getitem_by_mask_overload(self, idx): """ - valid_self_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(self, valid_self_types) - and not isinstance(self, types.NoneType)): - return None - - res_dtype = self.dtype - is_str_arr = self == string_array_type - is_numeric_index = isinstance(self, (RangeIndexType, Int64IndexType)) + if not isinstance(arr, (types.Array, StringArrayType, RangeIndexType)): + return - def getitem_by_mask_impl(self, idx): - chunks = parallel_chunks(len(self)) + res_dtype = arr.dtype + is_str_arr = arr == string_array_type + def getitem_by_mask_impl(arr, idx): + chunks = parallel_chunks(len(arr)) arr_len = numpy.empty(len(chunks), dtype=numpy.int64) length = 0 @@ -1013,18 +1002,16 @@ def getitem_by_mask_impl(self, idx): for j in range(chunk.start, chunk.stop): if idx[j]: - value = self[j] + value = arr[j] result_data[current_pos] = value if is_str_arr == True: # noqa - result_nan_mask[current_pos] = isna(self, j) + result_nan_mask[current_pos] = isna(arr, j) current_pos += 1 if is_str_arr == True: # noqa result_data_as_str_arr = create_str_arr_from_list(result_data) str_arr_set_na_by_mask(result_data_as_str_arr, result_nan_mask) return result_data_as_str_arr - elif is_numeric_index == True: # noqa - return pandas.Int64Index(result_data, name=self._name) else: return result_data @@ -1101,8 +1088,8 @@ def array_equal(A, B): def sdc_array_equal_overload(A, B): """ Checks 1D sequences A and B of comparable dtypes are equal """ - valid_arg_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(A, valid_arg_types) or isinstance(B, valid_arg_types)): + if not (isinstance(A, (types.Array, StringArrayType, types.NoneType, RangeIndexType)) + or isinstance(B, (types.Array, StringArrayType, types.NoneType, RangeIndexType))): return None _func_name = "numpy-like 'array_equal'" @@ -1155,9 +1142,6 @@ def sdc_np_array_overload(A): if isinstance(A, RangeIndexType): return lambda A: np.arange(A.start, 
A.stop, A.step) - if isinstance(A, Int64IndexType): - return lambda A: A._data - if isinstance(A, types.containers.Set): # TODO: naive implementation, data from set can probably # be copied to array more efficienty @@ -1225,7 +1209,7 @@ def sort_impl(a, axis=-1, kind=None, order=None): return sort_impl -def argsort(a, axis=-1, kind=None, order=None, ascending=True): +def argsort(a, axis=-1, kind=None, order=None): """ Returns the indices that would sort an array. @@ -1254,7 +1238,7 @@ def argsort(a, axis=-1, kind=None, order=None, ascending=True): @sdc_overload(argsort) -def argsort_overload(a, axis=-1, kind=None, order=None, ascending=True): +def argsort_overload(a, axis=-1, kind=None, order=None): _func_name = 'argsort' ty_checker = TypeChecker(_func_name) @@ -1266,15 +1250,15 @@ def argsort_overload(a, axis=-1, kind=None, order=None, ascending=True): if not is_default(order, None): raise TypingError(f'{_func_name} Unsupported parameter order') - def argsort_impl(a, axis=-1, kind=None, order=None, ascending=True): + def argsort_impl(a, axis=-1, kind=None, order=None): _kind = 'quicksort' if kind is not None: _kind = kind if _kind == 'quicksort': - return parallel_argsort(a, ascending) + return parallel_argsort(a) elif _kind == 'mergesort': - return parallel_stable_argsort(a, ascending) + return parallel_stable_argsort(a) else: raise ValueError("Unsupported value of 'kind' parameter") diff --git a/sdc/functions/sort.py b/sdc/functions/sort.py index 78a332d12..7ddbc8f31 100644 --- a/sdc/functions/sort.py +++ b/sdc/functions/sort.py @@ -47,7 +47,7 @@ def bind(sym, sig): parallel_sort_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_uint64, ct.c_uint64, ct.c_void_p,) -parallel_argsort_arithm_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64, ct.c_uint8) +parallel_argsort_arithm_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64) parallel_argsort_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64, ct.c_uint64, ct.c_void_p,) @@ -66,7 
+66,7 @@ def bind(sym, sig): parallel_sort_t_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_uint64) -parallel_argsort_t_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64, ct.c_uint8) +parallel_argsort_t_sig = ct.CFUNCTYPE(None, ct.c_void_p, ct.c_void_p, ct.c_uint64) set_threads_count_sig = ct.CFUNCTYPE(None, ct.c_uint64) set_threads_count_sym = bind('set_number_of_threads', set_threads_count_sig) @@ -290,19 +290,17 @@ def parallel_xargsort_overload_impl(dt, xargsort_map, xargsort_sym): if dt in types_to_postfix.keys(): sort_f = xargsort_map[dt] - def parallel_xargsort_arithm_impl(arr, ascending=True): + def parallel_xargsort_arithm_impl(arr): index = numpy.empty(shape=len(arr), dtype=numpy.int64) - sort_f(index.ctypes, arr.ctypes, len(arr), types.uint8(ascending)) + sort_f(index.ctypes, arr.ctypes, len(arr)) return index return parallel_xargsort_arithm_impl - # TO-DO: add/change adaptor to handle case of ascending=False - def parallel_xargsort_impl(arr, ascending=True): + def parallel_xargsort_impl(arr): item_size = itemsize(arr) index = numpy.empty(shape=len(arr), dtype=numpy.int64) - xargsort_sym(index.ctypes, arr.ctypes, len(arr), item_size, adaptor(arr[0], arr[0])) return index @@ -310,12 +308,12 @@ def parallel_xargsort_impl(arr, ascending=True): return parallel_xargsort_impl -def parallel_argsort(arr, ascending=True): +def parallel_argsort(arr): pass @overload(parallel_argsort) -def parallel_argsort_overload(arr, ascending=True): +def parallel_argsort_overload(arr): if not isinstance(arr, types.Array): raise NotImplementedError @@ -325,12 +323,12 @@ def parallel_argsort_overload(arr, ascending=True): return parallel_xargsort_overload_impl(dt, argsort_map, parallel_argsort_sym) -def parallel_stable_argsort(arr, ascending=True): +def parallel_stable_argsort(arr): pass @overload(parallel_stable_argsort) -def parallel_stable_argsort_overload(arr, ascending=True): +def parallel_argsort_overload(arr): if not isinstance(arr, types.Array): raise 
NotImplementedError diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index 8a0b7f622..77436f49b 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -39,7 +39,6 @@ from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (StringArrayType, string_array_type) from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.hiframes.pd_series_ext import ( SeriesType, if_series_to_array_type) @@ -161,7 +160,7 @@ def fix_df_array_list_str_impl(column): # pragma: no cover if isinstance(column, SeriesType): return lambda column: column._data - if isinstance(column, (RangeIndexType, Int64IndexType)): + if isinstance(column, RangeIndexType): return lambda column: np.array(column) if isinstance(column, (types.Array, StringArrayType, Categorical)): @@ -180,16 +179,10 @@ def fix_df_index_overload(index): def fix_df_index_impl(index): return None - elif isinstance(index, (RangeIndexType, Int64IndexType)): + elif isinstance(index, RangeIndexType): def fix_df_index_impl(index): return index - # currently only signed integer indexes are represented with own type - # TO-DO: support Uint64Index and Float64Indexes - elif isinstance(index.dtype, types.Integer) and index.dtype.signed: - def fix_df_index_impl(index): - index_data = fix_df_array(index) - return pd.Int64Index(index_data) else: # default case, transform index the same as df data def fix_df_index_impl(index): diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 12cb8850f..5e6930da9 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -47,17 +47,15 @@ from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical from sdc.hiframes.pd_series_ext import SeriesType from sdc.hiframes.pd_series_type import _get_series_array_type + from sdc.hiframes.pd_dataframe_ext import get_structure_maps -from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types from .. 
import hstr_ext import llvmlite.binding as ll from llvmlite import ir as lir from llvmlite.llvmpy.core import Type as LLType from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.extensions.indexes.range_index_ext import box_range_index, unbox_range_index -from sdc.extensions.indexes.int64_index_ext import box_int64_index, unbox_int64_index from sdc.str_arr_type import StringArrayType ll.add_symbol('array_size', hstr_ext.array_size) ll.add_symbol('array_getptr1', hstr_ext.array_getptr1) @@ -194,8 +192,6 @@ def _infer_series_list_dtype(S): def _infer_index_type(index): """ Deduces native Numba type used to represent index Python object """ - - # more specific types go first (e.g. RangeIndex is subtype of Int64Index) if isinstance(index, pd.RangeIndex): # depending on actual index value unbox to diff types: none-index if it matches # positions or to RangeIndexType in general case @@ -210,14 +206,6 @@ def _infer_index_type(index): # for unsupported pandas indexes we explicitly unbox to None if isinstance(index, pd.DatetimeIndex): return types.none - - if isinstance(index, pd.Int64Index): - index_data_type = numba.typeof(index._data) - if index.name is None: - return Int64IndexType(index_data_type) - else: - return Int64IndexType(index_data_type, is_named=True) - if index.dtype == np.dtype('O'): # TO-DO: should we check that all elements are strings? 
if len(index) > 0 and isinstance(index[0], str): @@ -335,14 +323,9 @@ def _unbox_index_data(index_typ, index_obj, c): if isinstance(index_typ, RangeIndexType): return unbox_range_index(index_typ, index_obj, c) - if isinstance(index_typ, Int64IndexType): - return unbox_int64_index(index_typ, index_obj, c) - if index_typ == string_array_type: return unbox_str_series(index_typ, index_obj, c) - # this is still here only because of Float64Index represented as array - # TO-DO: remove when it's added if isinstance(index_typ, types.Array): index_data = c.pyapi.object_getattr_string(index_obj, "_data") res = unbox_array(index_typ, index_data, c) @@ -454,12 +437,10 @@ def _box_index_data(index_typ, val, c): c: LLVM context object Returns: Python object native value is boxed into """ - assert isinstance(index_typ, sdc_pandas_index_types) + assert isinstance(index_typ, (RangeIndexType, StringArrayType, types.Array, types.NoneType)) if isinstance(index_typ, RangeIndexType): index = box_range_index(index_typ, val, c) - elif isinstance(index_typ, Int64IndexType): - index = box_int64_index(index_typ, val, c) elif isinstance(index_typ, types.Array): index = box_array(index_typ, val, c) elif isinstance(index_typ, StringArrayType): diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 0f062a4bf..d48aaf0f1 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -48,6 +48,7 @@ from sdc.hiframes.pd_series_type import SeriesType from sdc.datatypes.categorical.pdimpl import is_categoricaldtype from sdc.datatypes.series.pdimpl import _Series_category +from sdc.datatypes.range_index_type import RangeIndexType def is_str_series_typ(t): diff --git a/sdc/native/module.cpp b/sdc/native/module.cpp index 1571508ab..c0d9ff606 100644 --- a/sdc/native/module.cpp +++ b/sdc/native/module.cpp @@ -60,31 +60,31 @@ extern "C" void parallel_argsort_u64v(void* index, void* begin, uint64_t len, uint64_t size, void* compare); - void 
parallel_argsort_u64i8(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64u8(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64i16(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64u16(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64i32(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64u32(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64i64(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64u64(void* index, void* begin, uint64_t len, uint8_t ascending); - - void parallel_argsort_u64f32(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_argsort_u64f64(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_argsort_u64i8(void* index, void* begin, uint64_t len); + void parallel_argsort_u64u8(void* index, void* begin, uint64_t len); + void parallel_argsort_u64i16(void* index, void* begin, uint64_t len); + void parallel_argsort_u64u16(void* index, void* begin, uint64_t len); + void parallel_argsort_u64i32(void* index, void* begin, uint64_t len); + void parallel_argsort_u64u32(void* index, void* begin, uint64_t len); + void parallel_argsort_u64i64(void* index, void* begin, uint64_t len); + void parallel_argsort_u64u64(void* index, void* begin, uint64_t len); + + void parallel_argsort_u64f32(void* index, void* begin, uint64_t len); + void parallel_argsort_u64f64(void* index, void* begin, uint64_t len); void parallel_stable_argsort_u64v(void* index, void* begin, uint64_t len, uint64_t size, void* compare); - void parallel_stable_argsort_u64i8(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64u8(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64i16(void* index, void* begin, uint64_t len, 
uint8_t ascending); - void parallel_stable_argsort_u64u16(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64i32(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64u32(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64i64(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64u64(void* index, void* begin, uint64_t len, uint8_t ascending); - - void parallel_stable_argsort_u64f32(void* index, void* begin, uint64_t len, uint8_t ascending); - void parallel_stable_argsort_u64f64(void* index, void* begin, uint64_t len, uint8_t ascending); + void parallel_stable_argsort_u64i8(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64u8(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64i16(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64u16(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64i32(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64u32(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64i64(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64u64(void* index, void* begin, uint64_t len); + + void parallel_stable_argsort_u64f32(void* index, void* begin, uint64_t len); + void parallel_stable_argsort_u64f64(void* index, void* begin, uint64_t len); void set_number_of_threads(uint64_t threads) { diff --git a/sdc/native/sort.cpp b/sdc/native/sort.cpp index 9a6b96117..2e92c46b9 100644 --- a/sdc/native/sort.cpp +++ b/sdc/native/sort.cpp @@ -92,16 +92,8 @@ void parallel_argsort_(I* index, void* data, uint64_t len, uint64_t size, compar } // namespace #define declare_single_argsort(index_prefix, type_prefix, ity, ty) \ -void parallel_argsort_##index_prefix##type_prefix(void* index, void* begin, uint64_t len, uint8_t ascending) \ -{ \ - if 
(ascending) { \ - auto cmp = utils::less(); \ - parallel_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ - } else { \ - auto cmp = utils::greater(); \ - parallel_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ - } \ -} +void parallel_argsort_##index_prefix##type_prefix(void* index, void* begin, uint64_t len) \ +{ parallel_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len); } #define declare_argsort(prefix, ty) \ declare_single_argsort(u8, prefix, uint8_t, ty) \ diff --git a/sdc/native/stable_sort.cpp b/sdc/native/stable_sort.cpp index 05685086f..38b5e2758 100644 --- a/sdc/native/stable_sort.cpp +++ b/sdc/native/stable_sort.cpp @@ -281,16 +281,8 @@ struct parallel_sort_fixed_size } // namespace #define declare_single_argsort(index_prefix, type_prefix, ity, ty) \ -void parallel_stable_argsort_##index_prefix##type_prefix(ity* index, void* begin, uint64_t len, uint8_t ascending) \ -{ \ - if (ascending) { \ - auto cmp = utils::less(); \ - parallel_stable_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ - } else { \ - auto cmp = utils::greater(); \ - parallel_stable_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len, cmp); \ - } \ -} +void parallel_stable_argsort_##index_prefix##type_prefix(ity* index, void* begin, uint64_t len) \ +{ parallel_stable_argsort_(reinterpret_cast(index), reinterpret_cast(begin), len); } #define declare_argsort(prefix, ty) \ declare_single_argsort(u8, prefix, uint8_t, ty) \ @@ -347,4 +339,4 @@ void parallel_stable_sort(void* begin, uint64_t len, uint64_t size, void* compar #undef declare_int_sort #undef declare_sort #undef declare_argsort -#undef declare_single_argsort +#undef declare_single_argsort \ No newline at end of file diff --git a/sdc/native/utils.cpp b/sdc/native/utils.cpp index 7d5985d25..8067c094d 100644 --- a/sdc/native/utils.cpp +++ b/sdc/native/utils.cpp @@ -169,16 +169,4 @@ bool nanless(const double& left, const double& right) return 
std::less()(left, right) || (std::isnan(right) && !std::isnan(left)); } -template<> -bool nangreater(const float& left, const float& right) -{ - return std::greater()(left, right) || (std::isnan(right) && !std::isnan(left)); -} - -template<> -bool nangreater(const double& left, const double& right) -{ - return std::greater()(left, right) || (std::isnan(right) && !std::isnan(left)); -} - } diff --git a/sdc/native/utils.hpp b/sdc/native/utils.hpp index 5be6b327b..d3633ca63 100644 --- a/sdc/native/utils.hpp +++ b/sdc/native/utils.hpp @@ -266,27 +266,6 @@ struct less } }; -template -bool nangreater(const T& left, const T& right) -{ - return std::greater()(left, right); -} - -template<> -bool nangreater(const float& left, const float& right); - -template<> -bool nangreater(const double& left, const double& right); - -template -struct greater -{ - bool operator() (const T& left, const T& right) const - { - return nangreater(left, right); - } -}; - namespace tbb_control { void init(); diff --git a/sdc/set_ext.py b/sdc/set_ext.py index 1223c637a..645cf2df6 100644 --- a/sdc/set_ext.py +++ b/sdc/set_ext.py @@ -128,7 +128,6 @@ def _build_str_set_impl(A): str_set.add(_str) return str_set - # TODO: remove since probably unused @overload(set) def init_set_string_array(A): diff --git a/sdc/str_arr_ext.py b/sdc/str_arr_ext.py index e929785f5..1d12a80a5 100644 --- a/sdc/str_arr_ext.py +++ b/sdc/str_arr_ext.py @@ -54,50 +54,6 @@ from sdc.utilities.sdc_typing_utils import check_is_array_of_dtype -ll.add_symbol('get_str_len', hstr_ext.get_str_len) -ll.add_symbol('allocate_string_array', hstr_ext.allocate_string_array) -ll.add_symbol('setitem_string_array', hstr_ext.setitem_string_array) -ll.add_symbol('getitem_string_array', hstr_ext.getitem_string_array) -ll.add_symbol('getitem_string_array_std', hstr_ext.getitem_string_array_std) -ll.add_symbol('is_na', hstr_ext.is_na) -ll.add_symbol('string_array_from_sequence', hstr_ext.string_array_from_sequence) 
-ll.add_symbol('np_array_from_string_array', hstr_ext.np_array_from_string_array) -ll.add_symbol('print_int', hstr_ext.print_int) -ll.add_symbol('convert_len_arr_to_offset', hstr_ext.convert_len_arr_to_offset) -ll.add_symbol('set_string_array_range', hstr_ext.set_string_array_range) -ll.add_symbol('str_arr_to_int64', hstr_ext.str_arr_to_int64) -ll.add_symbol('str_arr_to_float64', hstr_ext.str_arr_to_float64) -ll.add_symbol('dtor_string_array', hstr_ext.dtor_string_array) -ll.add_symbol('c_glob', hstr_ext.c_glob) -ll.add_symbol('decode_utf8', hstr_ext.decode_utf8) -ll.add_symbol('get_utf8_size', hstr_ext.get_utf8_size) -ll.add_symbol('stable_argsort', hstr_ext.stable_argsort) - - -convert_len_arr_to_offset = types.ExternalFunction("convert_len_arr_to_offset", - types.void(types.voidptr, - types.intp)) - -setitem_string_array = types.ExternalFunction("setitem_string_array", - types.void(types.voidptr, - types.voidptr, - types.intp, - string_type, - types.intp)) - -_get_utf8_size = types.ExternalFunction("get_utf8_size", - types.intp(types.voidptr, # data_ptr - types.intp, # length - types.int32)) # kind - -_stable_argsort = types.ExternalFunction("stable_argsort", - types.void(types.intp, # data_ptr - types.intp, # offset_ptr - types.uint64, # data size - types.int8, # ascending - types.intp)) # result ptr - - @typeof_impl.register(StringArray) def typeof_string_array(val, c): return string_array_type @@ -567,6 +523,34 @@ def str_arr_len(str_arr): return str_arr_len +ll.add_symbol('get_str_len', hstr_ext.get_str_len) +ll.add_symbol('allocate_string_array', hstr_ext.allocate_string_array) +ll.add_symbol('setitem_string_array', hstr_ext.setitem_string_array) +ll.add_symbol('getitem_string_array', hstr_ext.getitem_string_array) +ll.add_symbol('getitem_string_array_std', hstr_ext.getitem_string_array_std) +ll.add_symbol('is_na', hstr_ext.is_na) +ll.add_symbol('string_array_from_sequence', hstr_ext.string_array_from_sequence) +ll.add_symbol('np_array_from_string_array', 
hstr_ext.np_array_from_string_array) +ll.add_symbol('print_int', hstr_ext.print_int) +ll.add_symbol('convert_len_arr_to_offset', hstr_ext.convert_len_arr_to_offset) +ll.add_symbol('set_string_array_range', hstr_ext.set_string_array_range) +ll.add_symbol('str_arr_to_int64', hstr_ext.str_arr_to_int64) +ll.add_symbol('str_arr_to_float64', hstr_ext.str_arr_to_float64) +ll.add_symbol('dtor_string_array', hstr_ext.dtor_string_array) +ll.add_symbol('c_glob', hstr_ext.c_glob) +ll.add_symbol('decode_utf8', hstr_ext.decode_utf8) +ll.add_symbol('get_utf8_size', hstr_ext.get_utf8_size) + +convert_len_arr_to_offset = types.ExternalFunction("convert_len_arr_to_offset", types.void(types.voidptr, types.intp)) + + +setitem_string_array = types.ExternalFunction("setitem_string_array", + types.void(types.voidptr, types.voidptr, types.intp, string_type, + types.intp)) +_get_utf8_size = types.ExternalFunction("get_utf8_size", + types.intp(types.voidptr, types.intp, types.int32)) + + def construct_string_array(context, builder): """Creates meminfo and sets dtor. 
""" @@ -1460,14 +1444,3 @@ def sdc_str_arr_operator_is(context, builder, sig, args): ma = builder.ptrtoint(a.meminfo, cgutils.intp_t) mb = builder.ptrtoint(b.meminfo, cgutils.intp_t) return builder.icmp_signed('==', ma, mb) - - -@numba.njit(no_cpython_wrapper=True) -def str_arr_stable_argosort(arr, ascending=True): - argsort_res = np.empty(len(arr), dtype=np.int64) - _stable_argsort(get_data_ptr(arr).data, - get_offset_ptr(arr).data, - len(arr), - types.int8(ascending), - argsort_res.ctypes.data) - return argsort_res diff --git a/sdc/tests/__init__.py b/sdc/tests/__init__.py index eeb4014b8..eaba6a8a9 100644 --- a/sdc/tests/__init__.py +++ b/sdc/tests/__init__.py @@ -44,7 +44,7 @@ from sdc.tests.test_io import * from sdc.tests.test_hpat_jit import * -from sdc.tests.indexes import * +from sdc.tests.test_indexes import * from sdc.tests.test_sdc_numpy import * from sdc.tests.test_prange_utils import * diff --git a/sdc/tests/indexes/__init__.py b/sdc/tests/indexes/__init__.py deleted file mode 100644 index 756d8cb55..000000000 --- a/sdc/tests/indexes/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# ***************************************************************************** -# Copyright (c) 2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ***************************************************************************** - -from sdc.tests.indexes.test_range_index import TestRangeIndex -from sdc.tests.indexes.test_int64_index import TestInt64Index -from sdc.tests.indexes.test_indexes import TestIndexes diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py deleted file mode 100644 index ba1ea5700..000000000 --- a/sdc/tests/indexes/index_datagens.py +++ /dev/null @@ -1,88 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ***************************************************************************** - -import numpy as np -import pandas as pd -from itertools import (combinations_with_replacement, filterfalse, chain) -from sdc.tests.test_utils import gen_strlist - - -test_global_index_names = [None, 'abc', 'index'] -test_global_range_member_values = [1, 2, 10, -5, 0, None] - - -def _generate_valid_range_params(): - - def valid_params_predicate(range_params): - # if step is zero or all start/stop/step are None range is invalid - return (range_params[-1] == 0 - or all(map(lambda x: x is None, range_params))) - - return filterfalse( - valid_params_predicate, - combinations_with_replacement(test_global_range_member_values, 3) - ) - - -def _generate_range_indexes_fixed(size, start=1, step=3): - yield pd.RangeIndex(size) - yield pd.RangeIndex(size, name='abc') - yield pd.RangeIndex(stop=step * size, step=step) - yield pd.RangeIndex(stop=2*step*size, step=2*step) - yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) - yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) - - -def _generate_index_param_values(n): - return chain( - [None], - 
_generate_range_indexes_fixed(n), - _generate_int64_indexes_fixed(n), - [np.arange(n) / 2], - [np.arange(n, dtype=np.uint64)], - [gen_strlist(n)], - ) - - -def _generate_valid_int64_index_data(): - n = 100 - yield np.arange(n) - yield np.arange(n) % 2 - yield np.ones(n, dtype=np.int16) - yield list(np.arange(n)) - yield pd.RangeIndex(n) - yield pd.Int64Index(np.arange(n)) - yield np.arange(n) * 2 - yield np.arange(2 * n) - - -def _generate_int64_indexes_fixed(size): - yield pd.Int64Index(np.arange(size)) - yield pd.Int64Index(np.arange(size), name='abc') - yield pd.Int64Index([i if i % 2 else 0 for i in range(size)]) - yield pd.Int64Index([i // 2 for i in range(size)]) - yield pd.Int64Index(np.ones(size)) diff --git a/sdc/tests/indexes/test_indexes.py b/sdc/tests/indexes/test_indexes.py deleted file mode 100644 index fa8bf6f71..000000000 --- a/sdc/tests/indexes/test_indexes.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** -# Copyright (c) 2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ***************************************************************************** - -import numpy as np -import pandas as pd -import unittest - -from sdc.tests.indexes import TestRangeIndex, TestInt64Index -from sdc.tests.indexes.index_datagens import _generate_index_param_values - - -class TestIndexes( - TestRangeIndex, - TestInt64Index - ): - """ This suite combines tests from all concrete index-type suites and also adds - tests for common use-cases that need to be checked for all index-types. """ - - def assert_indexes_equal(self, index1, index2): - # for SDC indexes that are represented with arrays (e.g. 
Uint64Index) - supported_pandas_indexes = (pd.RangeIndex, pd.Int64Index, ) - if (not isinstance(index1, supported_pandas_indexes) - or not isinstance(index2, supported_pandas_indexes)): - index1 = np.asarray(index1) - index2 = np.asarray(index2) - np.testing.assert_array_equal(index1, index2) - else: - pd.testing.assert_index_equal(index1, index2) - - @unittest.skip("TODO: support boxing/unboxing and parent ref for Python ranges in Numba") - def test_indexes_unbox_data_id_check(self): - def test_impl(index): - return index - sdc_func = self.jit(test_impl) - - n = 11 - indexes_to_test = [ - pd.RangeIndex(n, name='abc'), # only this one fails, other pass - pd.Int64Index(np.arange(n), name='abc'), - ] - data_attr_names_map = { - pd.RangeIndex: '_range', - pd.Int64Index: '_data', - } - - for index in indexes_to_test: - with self.subTest(index_type=type(index)): - result = sdc_func(index) - result_ref = test_impl(index) - - data1, data2, data3 = map( - lambda x: getattr(x, data_attr_names_map[type(x)]), - [index, result, result_ref] - ) - self.assertIs(data1, data3) - self.assertIs(data2, data3) - - @unittest.skip("Needs writable native struct type members in Numba") - def test_indexes_named_set_name(self): - def test_impl(index): - index.name = 'def' - return index - sdc_func = self.jit(test_impl) - - n = 11 - indexes_to_test = [ - pd.RangeIndex(n, name='abc'), - pd.Int64Index(np.arange(n), name='abc'), - ] - - for index in indexes_to_test: - with self.subTest(index_type=type(index)): - index1 = index.copy(deep=True) - index2 = index.copy(deep=True) - result = sdc_func(index1) - result_ref = test_impl(index2) - pd.testing.assert_index_equal(result, result_ref) - - @unittest.skip("Needs writable native struct type members and single common type for name") - def test_indexes_unnamed_set_name(self): - def test_impl(index): - index.name = 'def' - return index - sdc_func = self.jit(test_impl) - - n = 11 - indexes_to_test = [ - pd.RangeIndex(n), - 
pd.Int64Index(np.arange(n)), - ] - - for index in indexes_to_test: - with self.subTest(index_type=type(index)): - index1 = index.copy(deep=True) - index2 = index.copy(deep=True) - result = sdc_func(index1) - result_ref = test_impl(index2) - pd.testing.assert_index_equal(result, result_ref) - - @unittest.skip("Need support unboxing pandas indexes with parent ref") - def test_indexes_operator_is_unbox(self): - def test_impl(index1, index2): - return index1 is index2 - sdc_func = self.jit(test_impl) - - indexes_to_test = [ - pd.RangeIndex(1, 21, 3), - pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), - ] - - for index in indexes_to_test: - # positive testcase - with self.subTest(subtest="same indexes"): - index1 = index.copy(deep=True) - index2 = index1 - result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) - self.assertEqual(result, result_ref) - self.assertEqual(result, True) - - # negative testcase - with self.subTest(subtest="not same indexes"): - index1 = index.copy(deep=True) - index2 = index.copy(deep=True) - result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) - self.assertEqual(result, result_ref) - self.assertEqual(result, False) - - def test_indexes_unbox_series_with_index(self): - @self.jit - def test_impl(S): - # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, - # but this requires separate type (e.g. 
DefaultIndexType) instead of types.none as default index - return S.index - - n = 11 - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(series_index=index): - S = pd.Series(np.ones(n), index=index) - result = test_impl(S) - self.assert_indexes_equal(result, expected_res) - - def test_indexes_create_series_with_index(self): - @self.jit - def test_impl(data, index): - S = pd.Series(data=data, index=index) - return S.index - - n = 11 - series_data = np.ones(n) - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(series_index=index): - result = test_impl(series_data, index) - self.assert_indexes_equal(result, expected_res) - - def test_indexes_box_series_with_index(self): - def test_impl(data, index): - return pd.Series(data=data, index=index) - sdc_func = self.jit(test_impl) - - n = 11 - series_data = np.ones(n) - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - result = sdc_func(series_data, index) - result_ref = test_impl(series_data, index) - pd.testing.assert_series_equal(result, result_ref) - - def test_indexes_index_get_series_index(self): - def test_impl(S): - return S.index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - S = pd.Series(np.ones(n), index=index) - result = sdc_func(S) - result_ref = test_impl(S) - self.assert_indexes_equal(result, result_ref) - - def test_indexes_index_unbox_df_with_index(self): - @self.jit - def test_impl(df): - # TO-DO: this actually includes calling 'index' attribute overload, should really be df._index, - # but this requires separate type (e.g. 
DefaultIndexType) instead of types.none as default index - return df.index - - n = 11 - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(df_index=index): - df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}, index=index) - result = test_impl(df) - self.assert_indexes_equal(result, expected_res) - - def test_indexes_index_create_df_with_index(self): - @self.jit - def test_impl(A, B, index): - df = pd.DataFrame({'A': A, 'B': B}, index=index) - return df.index - - n = 11 - A, B = np.ones(n), np.arange(n) - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(df_index=index): - result = test_impl(A, B, index) - self.assert_indexes_equal(result, expected_res) - - def test_indexes_index_box_df_with_index(self): - def test_impl(A, B, index): - return pd.DataFrame({'A': A, 'B': B}, index=index) - sdc_func = self.jit(test_impl) - - n = 11 - A, B = np.ones(n), np.arange(n, dtype=np.intp) - for index in _generate_index_param_values(n): - with self.subTest(df_index=index): - result = sdc_func(A, B, index) - result_ref = test_impl(A, B, index) - pd.testing.assert_frame_equal(result, result_ref) - - def test_indexes_index_get_df_index(self): - def test_impl(df): - return df.index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_index_param_values(n): - with self.subTest(df_index=index): - df = pd.DataFrame({'A': np.ones(n)}, index=index) - result = sdc_func(df) - result_ref = test_impl(df) - self.assert_indexes_equal(result, result_ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/sdc/tests/indexes/test_int64_index.py b/sdc/tests/indexes/test_int64_index.py deleted file mode 100644 index 875d6e6dc..000000000 --- a/sdc/tests/indexes/test_int64_index.py +++ /dev/null @@ -1,583 +0,0 @@ -# -*- coding: utf-8 -*- -# ***************************************************************************** 
-# Copyright (c) 2020, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ***************************************************************************** - -import numpy as np -import pandas as pd -import unittest -from itertools import (combinations_with_replacement, product, ) - -from sdc.tests.indexes.index_datagens import ( - test_global_index_names, - _generate_valid_int64_index_data, - _generate_int64_indexes_fixed, - ) -from sdc.tests.test_base import TestCase - - -class TestInt64Index(TestCase): - - def test_int64_index_create_and_box(self): - def test_impl(data, name): - return pd.Int64Index(data, name=name) - sdc_func = self.jit(test_impl) - - name = 'index' - for data in _generate_valid_int64_index_data(): - with self.subTest(index_data=data): - result = sdc_func(data, name) - result_ref = test_impl(data, name) - pd.testing.assert_index_equal(result, result_ref) - - def test_int64_index_unbox_and_box(self): - def test_impl(index): - return index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_int64_indexes_fixed(n): - with self.subTest(index=index): - result = sdc_func(index) - result_ref = test_impl(index) - pd.testing.assert_index_equal(result, result_ref) - - def test_int64_index_create_param_copy_true(self): - def test_impl(arr): - return pd.Int64Index(arr, copy=True) - sdc_func = self.jit(test_impl) - - index_data_to_test = [ - np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64), - list(np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64)), - pd.RangeIndex(11), - pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), - ] - - for index_data in index_data_to_test: - with self.subTest(index_data=index_data): - result = sdc_func(index_data) - result_ref = test_impl(index_data) - pd.testing.assert_index_equal(result, result_ref) - self.assertEqual(result._data is result_ref._data, False) - - def test_int64_index_create_param_copy_default(self): - def test_impl(arr): - return pd.Int64Index(arr) - sdc_func = self.jit(test_impl) - - # only test data that has underlying array that can be referenced - # and ensure it has int64 dtype as 
otherwise there will always be a copy - index_data_to_test = [ - np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64), - pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), - ] - - for index_data in index_data_to_test: - with self.subTest(index_data=index_data): - result = sdc_func(index_data) - result_ref = test_impl(index_data) - pd.testing.assert_index_equal(result, result_ref) - self.assertEqual(result._data is result_ref._data, True) - - def test_int64_index_create_param_dtype(self): - def test_impl(n, dtype): - return pd.Int64Index(np.arange(n), dtype=dtype) - sdc_func = self.jit(test_impl) - - n = 11 - supported_dtypes = [None, np.int64, 'int64', np.int32, 'int32'] - for dtype in supported_dtypes: - with self.subTest(dtype=dtype): - result = sdc_func(n, dtype) - result_ref = test_impl(n, dtype) - pd.testing.assert_index_equal(result, result_ref) - - def test_int64_index_create_param_dtype_invalid(self): - def test_impl(n, dtype): - return pd.Int64Index(np.arange(n), dtype=dtype) - sdc_func = self.jit(test_impl) - - n = 11 - invalid_dtypes = ['float', 'uint'] - for dtype in invalid_dtypes: - with self.subTest(dtype=dtype): - with self.assertRaises(Exception) as context: - test_impl(n, dtype) - pandas_exception = context.exception - - with self.assertRaises(type(pandas_exception)) as context: - sdc_func(n, dtype) - sdc_exception = context.exception - self.assertIn(str(sdc_exception), str(pandas_exception)) - - def test_int64_index_attribute_dtype(self): - def test_impl(index): - return index.dtype - sdc_func = self.jit(test_impl) - - n = 11 - index = pd.Int64Index(np.arange(n) * 2) - result = sdc_func(index) - result_ref = test_impl(index) - self.assertEqual(result, result_ref) - - def test_int64_index_attribute_name(self): - def test_impl(index): - return index.name - sdc_func = self.jit(test_impl) - - n = 11 - index_data = np.arange(n) * 2 - for name in test_global_index_names: - with self.subTest(name=name): - index = pd.Int64Index(index_data, name=name) - result = 
sdc_func(index) - result_ref = test_impl(index) - self.assertEqual(result, result_ref) - - def test_int64_index_len(self): - def test_impl(index): - return len(index) - sdc_func = self.jit(test_impl) - - n = 11 - index = pd.Int64Index(np.arange(n) * 2, name='index') - result = sdc_func(index) - result_ref = test_impl(index) - self.assertEqual(result, result_ref) - - def test_int64_index_attribute_values(self): - def test_impl(index): - return index.values - sdc_func = self.jit(test_impl) - - for data in _generate_valid_int64_index_data(): - index = pd.Int64Index(data) - with self.subTest(index_data=data): - result = sdc_func(index) - result_ref = test_impl(index) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_contains(self): - def test_impl(index, value): - return value in index - sdc_func = self.jit(test_impl) - - index = pd.Int64Index([1, 11, 2]) - values_to_test = [-5, 15, 1, 11, 5, 6] - for value in values_to_test: - with self.subTest(value=value): - result = sdc_func(index, value) - result_ref = test_impl(index, value) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_copy(self): - def test_impl(index, new_name): - return index.copy(name=new_name) - sdc_func = self.jit(test_impl) - - for data in _generate_valid_int64_index_data(): - for name, new_name in product(test_global_index_names, repeat=2): - index = pd.Int64Index(data, name=name) - with self.subTest(index=index, new_name=new_name): - result = sdc_func(index, new_name) - result_ref = test_impl(index, new_name) - pd.testing.assert_index_equal(result, result_ref) - - def test_int64_index_copy_param_deep(self): - def test_impl(index, deep): - return index.copy(deep=deep) - sdc_func = self.jit(test_impl) - - index = pd.Int64Index([1, 11, 2]) - for deep in [True, False]: - with self.subTest(deep=deep): - result = sdc_func(index, deep) - result_ref = test_impl(index, deep) - pd.testing.assert_index_equal(result, result_ref) - # pandas uses ndarray views 
when copies index, so for python - # case check that data arrays share the same memory - self.assertEqual( - result._data is index._data, - result_ref._data.base is index._data - ) - - def test_int64_index_getitem_scalar(self): - def test_impl(index, idx): - return index[idx] - sdc_func = self.jit(test_impl) - - for data in _generate_valid_int64_index_data(): - index = pd.Int64Index(data) - n = len(index) - values_to_test = [-n, n // 2, n - 1] - for idx in values_to_test: - with self.subTest(index=index, idx=idx): - result = sdc_func(index, idx) - result_ref = test_impl(index, idx) - self.assertEqual(result, result_ref) - - def test_int64_index_getitem_scalar_idx_bounds(self): - def test_impl(index, idx): - return index[idx] - sdc_func = self.jit(test_impl) - - n = 11 - index = pd.Int64Index(np.arange(n) * 2, name='abc') - values_to_test = [-(n + 1), n] - for idx in values_to_test: - with self.subTest(idx=idx): - with self.assertRaises(Exception) as context: - test_impl(index, idx) - pandas_exception = context.exception - - with self.assertRaises(type(pandas_exception)) as context: - sdc_func(index, idx) - sdc_exception = context.exception - self.assertIsInstance(sdc_exception, type(pandas_exception)) - self.assertIn("out of bounds", str(sdc_exception)) - - def test_int64_index_getitem_slice(self): - def test_impl(index, idx): - return index[idx] - sdc_func = self.jit(test_impl) - - index_len = 11 - slices_params = combinations_with_replacement( - [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], - 3 - ) - - for data in _generate_valid_int64_index_data(): - for slice_start, slice_stop, slice_step in slices_params: - # slice step cannot be zero - if slice_step == 0: - continue - - idx = slice(slice_start, slice_stop, slice_step) - index = pd.Int64Index(data, name='abc') - with self.subTest(index=index, idx=idx): - result = sdc_func(index, idx) - result_ref = test_impl(index, idx) - pd.testing.assert_index_equal(result, 
result_ref) - - def test_int64_index_iterator_1(self): - def test_impl(index): - res = [] - for i, label in enumerate(index): - res.append((i, label)) - return res - sdc_func = self.jit(test_impl) - - index = pd.Int64Index([5, 3, 2, 1, 7, 4]) - result = sdc_func(index) - result_ref = test_impl(index) - self.assertEqual(result, result_ref) - - def test_int64_index_iterator_2(self): - def test_impl(index): - res = [] - for label in index: - if not label % 2: - res.append(label) - return res - sdc_func = self.jit(test_impl) - - index = pd.Int64Index([5, 3, 2, 1, 7, 4]) - result = sdc_func(index) - result_ref = test_impl(index) - self.assertEqual(result, result_ref) - - def test_int64_index_nparray(self): - def test_impl(index): - return np.array(index) - sdc_func = self.jit(test_impl) - - n = 11 - index = pd.Int64Index(np.arange(n) * 2) - result = sdc_func(index) - result_ref = test_impl(index) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_operator_eq_index(self): - def test_impl(index1, index2): - return index1 == index2 - sdc_func = self.jit(test_impl) - - n = 11 - for index1, index2 in product(_generate_int64_indexes_fixed(n), repeat=2): - with self.subTest(index1=index1, index2=index2): - result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray - result_ref = test_impl(index1, index2) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_operator_eq_scalar(self): - def test_impl(A, B): - return A == B - sdc_func = self.jit(test_impl) - - n = 11 - A = pd.Int64Index(np.arange(n) * 2) - scalars_to_test = [0, 22, 13, -5, 4.0] - for B in scalars_to_test: - for swap_operands in (False, True): - if swap_operands: - A, B = B, A - with self.subTest(left=A, right=B): - result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray - result_ref = test_impl(A, B) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_operator_eq_nparray(self): - def test_impl(A, 
B): - return A == B - sdc_func = self.jit(test_impl) - - n = 11 - for A, B in product( - _generate_int64_indexes_fixed(n), - map(lambda x: np.array(x), _generate_int64_indexes_fixed(n)) - ): - for swap_operands in (False, True): - if swap_operands: - A, B = B, A - with self.subTest(left=A, right=B): - result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray - result_ref = test_impl(A, B) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_operator_ne_index(self): - def test_impl(index1, index2): - return index1 != index2 - sdc_func = self.jit(test_impl) - - n = 11 - for index1, index2 in product(_generate_int64_indexes_fixed(n), repeat=2): - with self.subTest(index1=index1, index2=index2): - result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray - result_ref = test_impl(index1, index2) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_operator_is_nounbox(self): - def test_impl_1(data): - index1 = pd.Int64Index(data) - index2 = index1 - return index1 is index2 - sdc_func_1 = self.jit(test_impl_1) - - def test_impl_2(data): - index1 = pd.Int64Index(data) - index2 = pd.Int64Index(data) - return index1 is index2 - sdc_func_2 = self.jit(test_impl_2) - - # positive testcase - index_data = [1, 2, 3, 5, 6, 3, 4] - with self.subTest(subtest="same indexes"): - result = sdc_func_1(index_data) - result_ref = test_impl_1(index_data) - self.assertEqual(result, result_ref) - self.assertEqual(result, True) - - # negative testcase - with self.subTest(subtest="not same indexes"): - result = sdc_func_2(index_data) - result_ref = test_impl_2(index_data) - self.assertEqual(result, result_ref) - self.assertEqual(result, False) - - def test_int64_index_getitem_by_mask(self): - def test_impl(index, mask): - return index[mask] - sdc_func = self.jit(test_impl) - - n = 11 - np.random.seed(0) - mask = np.random.choice([True, False], n) - for index in _generate_int64_indexes_fixed(n): - result = 
sdc_func(index, mask) - result_ref = test_impl(index, mask) - pd.testing.assert_index_equal(result, result_ref) - - def test_int64_index_support_reindexing(self): - from sdc.datatypes.common_functions import sdc_reindex_series - - def pyfunc(data, index, name, by_index): - S = pd.Series(data, index, name=name) - return S.reindex(by_index) - - @self.jit - def sdc_func(data, index, name, by_index): - return sdc_reindex_series(data, index, name, by_index) - - n = 10 - np.random.seed(0) - mask = np.random.choice([True, False], n) - name = 'asdf' - index1 = pd.Int64Index(np.arange(n)) - index2 = pd.Int64Index(np.arange(n))[::-1] - result = sdc_func(mask, index1, name, index2) - result_ref = pyfunc(mask, index1, name, index2) - pd.testing.assert_series_equal(result, result_ref) - - def test_int64_index_support_join(self): - from sdc.datatypes.common_functions import sdc_join_series_indexes - - def pyfunc(index1, index2): - return index1.join(index2, how='outer', return_indexers=True) - - @self.jit - def sdc_func(index1, index2): - return sdc_join_series_indexes(index1, index2) - - index1 = pd.Int64Index(np.arange(-5, 5, 1), name='asv') - index2 = pd.Int64Index(np.arange(0, 10, 2), name='df') - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - results_names = ['result index', 'left indexer', 'right indexer'] - for i, name in enumerate(results_names): - result_elem = result[i] - result_ref_elem = result_ref[i].values if not i else result_ref[i] - np.testing.assert_array_equal(result_elem, result_ref_elem, f"Mismatch in {name}") - - def test_int64_index_support_take_from(self): - from sdc.datatypes.common_functions import _sdc_take - - def pyfunc(index1, indexes): - return index1.values.take(indexes) - - @self.jit - def sdc_func(index1, indexes): - return _sdc_take(index1, indexes) - - n, k = 1000, 200 - np.random.seed(0) - index = pd.Int64Index(np.arange(n) * 2, name='asd') - indexes = np.random.choice(np.arange(n), n)[:k] - result = sdc_func(index, 
indexes) - result_ref = pyfunc(index, indexes) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_support_take_by(self): - from sdc.datatypes.common_functions import _sdc_take - - def pyfunc(arr, index): - return np.take(arr, index) - - @self.jit - def sdc_func(arr, index): - return _sdc_take(arr, index) - - n, k = 1000, 200 - np.random.seed(0) - arr = np.arange(n) * 2 - index = pd.Int64Index(np.random.choice(np.arange(n), n)[:k]) - result = sdc_func(arr, index) - result_ref = pyfunc(arr, index) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_support_astype(self): - from sdc.functions.numpy_like import astype - - def pyfunc(index): - return index.values.astype(np.int64) - - @self.jit - def sdc_func(index): - return astype(index, np.int64) - - n = 100 - index = pd.Int64Index(np.arange(n) * 2, name='asd') - np.testing.assert_array_equal(sdc_func(index), pyfunc(index)) - - def test_int64_index_support_array_equal(self): - from sdc.functions.numpy_like import array_equal - - def pyfunc(index1, index2): - return np.array_equal(index1.values, index2.values) - - @self.jit - def sdc_func(index1, index2): - return array_equal(index1, index2) - - n = 11 - indexes_to_test = [ - pd.Int64Index(np.arange(n)), - pd.Int64Index(np.arange(n), name='asd'), - pd.Int64Index(np.arange(n) * 2, name='asd'), - pd.Int64Index(np.arange(2 * n)), - ] - for index1, index2 in combinations_with_replacement(indexes_to_test, 2): - with self.subTest(index1=index1, index2=index2): - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - self.assertEqual(result, result_ref) - - def test_int64_index_support_copy(self): - from sdc.functions.numpy_like import copy - - @self.jit - def sdc_func(index): - return copy(index) - - for data in _generate_valid_int64_index_data(): - for name in test_global_index_names: - index = pd.Int64Index(data, name=name) - with self.subTest(index=index): - result = sdc_func(index) - 
pd.testing.assert_index_equal(result, index) - - def test_int64_index_support_append(self): - from sdc.datatypes.common_functions import hpat_arrays_append - - def pyfunc(index1, index2): - return index1.append(index2) - - @self.jit - def sdc_func(index1, index2): - return hpat_arrays_append(index1, index2) - - n = 11 - index1 = pd.Int64Index(np.arange(n), name='asv') - index2 = pd.Int64Index(2 * np.arange(n), name='df') - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_ravel(self): - def test_impl(index): - return index.ravel() - sdc_func = self.jit(test_impl) - - n = 11 - index = pd.Int64Index(np.arange(n) * 2) - result = sdc_func(index) - result_ref = test_impl(index) - np.testing.assert_array_equal(result, result_ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index aa90c7f23..c245f0694 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -362,30 +362,6 @@ def test_impl(n): self.assertEqual(count_parfor_REPs(), 0) self.assertEqual(count_parfor_OneDs(), 1) - @unittest.skip("Works, but compile time needs debug") - def test_column_getitem_repeats(self): - def test_impl(a, b, c): - df = pd.DataFrame({ - 'A': a, - 'B': b, - 'C': c, - }) - - A = df['A'] - B = df['B'] - C = df['C'] - return A[0] + B[0] + C[0] - sdc_func = self.jit(test_impl) - - n = 11 - np.random.seed(0) - a = np.ones(n) - b = np.random.ranf(n) - c = np.random.randint(-100, 100, n) - result = sdc_func(a, b, c) - result_ref = pd.Series(test_impl(a, b, c)) - pd.testing.assert_series_equal(result, result_ref) - @skip_numba_jit def test_column_list_getitem1(self): def test_impl(df): diff --git a/sdc/tests/test_date.py b/sdc/tests/test_date.py index 83d001349..83671cee4 100644 --- a/sdc/tests/test_date.py +++ b/sdc/tests/test_date.py @@ -81,9 +81,7 @@ def test_impl(A): hpat_func = self.jit(test_impl) df = 
self._gen_str_date_df() A = pd.DatetimeIndex(df['str_date']).to_series() - result = hpat_func(A) - result_ref = test_impl(A) - np.testing.assert_array_equal(result, result_ref) + np.testing.assert_array_equal(hpat_func(A), test_impl(A)) @skip_numba_jit def test_datetime_getitem(self): diff --git a/sdc/tests/test_hpat_jit.py b/sdc/tests/test_hpat_jit.py index 8644551c3..5e47f6f8e 100644 --- a/sdc/tests/test_hpat_jit.py +++ b/sdc/tests/test_hpat_jit.py @@ -36,7 +36,6 @@ from sdc import * from sdc.tests.test_base import TestCase from sdc.tests.test_utils import skip_numba_jit -from numba.experimental import jitclass class TestHpatJitIssues(TestCase): diff --git a/sdc/tests/indexes/test_range_index.py b/sdc/tests/test_indexes.py similarity index 74% rename from sdc/tests/indexes/test_range_index.py rename to sdc/tests/test_indexes.py index e9369c9f7..b277ac9a1 100644 --- a/sdc/tests/indexes/test_range_index.py +++ b/sdc/tests/test_indexes.py @@ -28,18 +28,43 @@ import numpy as np import pandas as pd import unittest -from itertools import (combinations_with_replacement, product, ) -from numba.core.errors import TypingError -from sdc.tests.indexes.index_datagens import ( - test_global_index_names, - _generate_valid_range_params, - _generate_range_indexes_fixed, - _generate_index_param_values, - ) +from itertools import (combinations_with_replacement, product, filterfalse, chain) + from sdc.tests.test_base import TestCase from sdc.utilities.sdc_typing_utils import kwsparams2list from sdc.tests.test_series import _make_func_from_text +from numba.core.errors import TypingError + + +test_global_index_names = [None, 'abc', 'index'] +test_global_range_member_values = [1, 2, 10, -5, 0, None] + + +def _generate_valid_range_params(): + + def valid_params_predicate(range_params): + # if step is zero or all start/stop/step are None range is invalid + return (range_params[-1] == 0 + or all(map(lambda x: x is None, range_params))) + + return filterfalse( + valid_params_predicate, + 
combinations_with_replacement(test_global_range_member_values, 3) + ) + + +def _generate_range_indexes_fixed(size, start=1, step=3): + yield pd.RangeIndex(size) + yield pd.RangeIndex(size, name='abc') + yield pd.RangeIndex(stop=step * size, step=step) + yield pd.RangeIndex(stop=2*step*size, step=2*step) + yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) + yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) + + +def _generate_index_param_values(n): + return chain([None], _generate_range_indexes_fixed(n)) class TestRangeIndex(TestCase): @@ -71,6 +96,18 @@ def test_impl(index): result_ref = test_impl(index) pd.testing.assert_index_equal(result, result_ref) + @unittest.skip("TODO: support boxing/unboxing and parent ref for Python ranges in Numba") + def test_range_index_unbox_data_id_check(self): + def test_impl(index): + return index + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(11, name='abc') + result = sdc_func(index) + result_ref = test_impl(index) + self.assertIs(index._range, result_ref._range) + self.assertIs(result._range, result_ref._range) + @unittest.skip("TODO: add support for integers as floats in ctor") def test_range_index_create_from_floats(self): def test_impl(*args): @@ -82,7 +119,7 @@ def test_impl(*args): result_ref = test_impl(start, stop, step) pd.testing.assert_index_equal(result, result_ref) - def test_range_index_create_invalid_1(self): + def test_range_index_create_invalid1(self): def test_impl(start, stop, step): return pd.RangeIndex(start, stop, step) sdc_func = self.jit(test_impl) @@ -98,7 +135,7 @@ def test_impl(start, stop, step): sdc_exception = context.exception self.assertIn(str(sdc_exception), str(pandas_exception)) - def test_range_index_create_invalid_2(self): + def test_range_index_create_invalid2(self): def test_impl(): return pd.RangeIndex(name='index') sdc_func = self.jit(test_impl) @@ -356,6 +393,150 @@ def test_impl(index, idx): result_ref = 
test_impl(index, idx) pd.testing.assert_index_equal(result, result_ref) + @unittest.skip("Needs writable native struct type members in Numba") + def test_range_index_named_set_name(self): + def test_impl(index): + index.name = 'def' + return index + sdc_func = self.jit(test_impl) + + n = 11 + index1 = pd.RangeIndex(n, name='abc') + index2 = index1.copy(deep=True) + result = sdc_func(index1) + result_ref = test_impl(index2) + pd.testing.assert_index_equal(result, result_ref) + + @unittest.skip("Needs writable native struct type members and single common type for name") + def test_range_index_unnamed_set_name(self): + def test_impl(index): + index.name = 'def' + return index + sdc_func = self.jit(test_impl) + + n = 11 + index1 = pd.RangeIndex(n, name='abc') + index2 = index1.copy(deep=True) + result = sdc_func(index1) + result_ref = test_impl(index2) + pd.testing.assert_index_equal(result, result_ref) + + def _test_range_indexes(self, test_impl, indexes, size, apply_func): + for index in indexes: + expected_res = pd.RangeIndex(size) if index is None else index + with self.subTest(series_index=index): + args = apply_func(size, index) + result = test_impl(args) + pd.testing.assert_index_equal(result, expected_res) + + def test_range_index_unbox_series_with_index(self): + @self.jit + def test_impl(S): + # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, + # but this requires separate type (e.g. 
DefaultIndexType) instead of types.none as native index + return S.index + + n = 11 + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(series_index=index): + S = pd.Series(np.ones(n), index=index) + result = test_impl(S) + pd.testing.assert_index_equal(result, expected_res) + + def test_range_index_create_series_with_index(self): + @self.jit + def test_impl(data, index): + S = pd.Series(data=data, index=index) + return S.index + + n = 11 + series_data = np.ones(n) + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(series_index=index): + result = test_impl(series_data, index) + pd.testing.assert_index_equal(result, expected_res) + + def test_range_index_box_series_with_index(self): + def test_impl(data, index): + return pd.Series(data=data, index=index) + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.ones(n) + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + result = sdc_func(series_data, index) + result_ref = test_impl(series_data, index) + pd.testing.assert_series_equal(result, result_ref) + + def test_range_index_get_series_index(self): + def test_impl(S): + return S.index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + S = pd.Series(np.ones(n), index=index) + result = sdc_func(S) + result_ref = test_impl(S) + pd.testing.assert_index_equal(result, result_ref) + + def test_range_index_unbox_df_with_index(self): + @self.jit + def test_impl(df): + return df.index + + n = 11 + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(df_index=index): + df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}, index=index) + result = test_impl(df) + pd.testing.assert_index_equal(result, expected_res) + + 
def test_range_index_create_df_with_index(self): + @self.jit + def test_impl(A, B, index): + df = pd.DataFrame({'A': A, 'B': B}, index=index) + return df.index + + n = 11 + A, B = np.ones(n), np.arange(n) + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(df_index=index): + result = test_impl(A, B, index) + pd.testing.assert_index_equal(result, expected_res) + + def test_range_index_box_df_with_index(self): + def test_impl(A, B, index): + return pd.DataFrame({'A': A, 'B': B}, index=index) + sdc_func = self.jit(test_impl) + + n = 11 + A, B = np.ones(n), np.arange(n, dtype=np.intp) + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + result = sdc_func(A, B, index) + result_ref = test_impl(A, B, index) + pd.testing.assert_frame_equal(result, result_ref) + + def test_range_index_get_df_index(self): + def test_impl(df): + return df.index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + df = pd.DataFrame({'A': np.ones(n)}, index=index) + result = sdc_func(df) + result_ref = test_impl(df) + pd.testing.assert_index_equal(result, result_ref) + def test_range_index_iterator_1(self): def test_impl(index): res = [] @@ -479,7 +660,29 @@ def test_impl(index1, index2): result_ref = test_impl(index1, index2) np.testing.assert_array_equal(result, result_ref) - def test_range_index_operator_is_nounbox(self): + @unittest.skip("Need support unboxing Python range in Numba with parent ref") + def test_range_index_operator_is_1(self): + def test_impl(index1, index2): + return index1 is index2 + sdc_func = self.jit(test_impl) + + # positive testcase + with self.subTest(subtest="same indexes"): + index1 = pd.RangeIndex(1, 21, 3) + index2 = index1 + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + + # negative testcase + with 
self.subTest(subtest="not same indexes"): + index1 = pd.RangeIndex(1, 21, 3) + index2 = pd.RangeIndex(1, 21, 3) + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + + def test_range_index_operator_is_2(self): def test_impl_1(*args): index1 = pd.RangeIndex(*args) index2 = index1 @@ -498,14 +701,12 @@ def test_impl_2(*args): result = sdc_func_1(*params) result_ref = test_impl_1(*params) self.assertEqual(result, result_ref) - self.assertEqual(result, True) # negative testcase with self.subTest(subtest="not same indexes"): result = sdc_func_2(*params) result_ref = test_impl_2(*params) self.assertEqual(result, result_ref) - self.assertEqual(result, False) def test_range_index_getitem_by_mask(self): def test_impl(index, mask): @@ -518,7 +719,8 @@ def test_impl(index, mask): for index in _generate_range_indexes_fixed(n): result = sdc_func(index, mask) result_ref = test_impl(index, mask) - pd.testing.assert_index_equal(result, result_ref) + # FIXME: replace with pd.testing.assert_index_equal when Int64Index is supported + np.testing.assert_array_equal(result, result_ref.values) def test_range_index_support_reindexing(self): from sdc.datatypes.common_functions import sdc_reindex_series @@ -625,34 +827,6 @@ def sdc_func(index): result = sdc_func(index) pd.testing.assert_index_equal(result, index) - def test_range_index_support_append(self): - from sdc.datatypes.common_functions import hpat_arrays_append - - def pyfunc(index1, index2): - return index1.append(index2) - - @self.jit - def sdc_func(index1, index2): - return hpat_arrays_append(index1, index2) - - n = 11 - index1 = pd.RangeIndex(1, 21, 3, name='asv') - index2 = pd.RangeIndex(19, -1, -3, name='df') - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - np.testing.assert_array_equal(result, result_ref) - - def test_range_index_ravel(self): - def test_impl(index): - return index.ravel() - sdc_func = self.jit(test_impl) - - n = 11 - 
index = pd.RangeIndex(n) - result = sdc_func(index) - result_ref = test_impl(index) - np.testing.assert_array_equal(result, result_ref) - if __name__ == "__main__": unittest.main() diff --git a/sdc/tests/test_rolling.py b/sdc/tests/test_rolling.py index 27f625b42..44b50370c 100644 --- a/sdc/tests/test_rolling.py +++ b/sdc/tests/test_rolling.py @@ -1149,6 +1149,7 @@ def test_impl(df, other, pairwise): hpat_func(df, other, True) self.assertIn(msg_tmpl.format('False, None'), str(raises.exception)) + @unittest.expectedFailure def test_df_rolling_cov_issue_floating_point_rounding(self): """ Cover issue of different float rounding in Python and SDC/Numba: diff --git a/sdc/tests/test_sdc_numpy.py b/sdc/tests/test_sdc_numpy.py index 9b7a74846..d6bda23db 100644 --- a/sdc/tests/test_sdc_numpy.py +++ b/sdc/tests/test_sdc_numpy.py @@ -387,45 +387,6 @@ def run_test(ref_impl, sdc_impl, data, kind): with self.subTest(data=case, kind=kind, size=len(int_array)): run_test(ref_impl, sdc_func, data, kind) - def test_argsort_param_ascending(self): - - def ref_impl(a, kind, ascending): - return pd.Series(a).sort_values(kind=kind, ascending=ascending).index - - def sdc_impl(a, kind, ascending): - return numpy_like.argsort(a, kind=kind, ascending=ascending) - - def run_test(ref_impl, sdc_impl, data, kind, ascending): - if kind == 'mergesort': - np.testing.assert_array_equal( - ref_impl(data, kind, ascending), - sdc_func(data, kind, ascending)) - else: - sorted_ref = data[ref_impl(data, kind, ascending)] - sorted_sdc = data[sdc_impl(data, kind, ascending)] - np.testing.assert_array_equal(sorted_ref, sorted_sdc) - - sdc_func = self.jit(sdc_impl) - - n = 100 - np.random.seed(0) - data_values = { - 'float': [np.inf, np.NINF, np.nan, 0, -1, 2.1, 2/3, -3/4, 0.777], - 'int': [1, -1, 3, 5, -60, 21, 22, 23], - } - all_dtypes = { - 'float': ['float32', 'float64'], - 'int': ['int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64'] - } - - for kind, ascending in product([None, 
'quicksort', 'mergesort'], [True, False]): - for dtype_group, arr_values in data_values.items(): - for dtype in all_dtypes[dtype_group]: - data = np.random.choice(arr_values, n).astype(dtype) - with self.subTest(data=data, kind=kind, ascending=ascending): - run_test(ref_impl, sdc_func, data, kind, ascending) - - def _test_fillna_numeric(self, pyfunc, cfunc, inplace): data_to_test = [ [True, False, False, True, True], diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index ec829aa60..3c5db9c1f 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -33,7 +33,7 @@ import sdc import string import unittest -from itertools import combinations, combinations_with_replacement, islice, permutations, product +from itertools import combinations, combinations_with_replacement, product import numba from numba import types from numba.core.config import IS_32BITS @@ -323,7 +323,24 @@ def test_impl(n): n = 11 pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) - def test_create_series_param_name_literal(self): + def test_create_series_index1(self): + # create and box an indexed Series + def test_impl(): + A = pd.Series([1, 2, 3], ['A', 'C', 'B']) + return A + hpat_func = self.jit(test_impl) + + pd.testing.assert_series_equal(hpat_func(), test_impl()) + + def test_create_series_index2(self): + def test_impl(): + A = pd.Series([1, 2, 3], index=[2, 1, 0]) + return A + hpat_func = self.jit(test_impl) + + pd.testing.assert_series_equal(hpat_func(), test_impl()) + + def test_create_series_index3(self): def test_impl(): A = pd.Series([1, 2, 3], index=['A', 'C', 'B'], name='A') return A @@ -331,7 +348,7 @@ def test_impl(): pd.testing.assert_series_equal(hpat_func(), test_impl()) - def test_create_series_param_name(self): + def test_create_series_index4(self): def test_impl(name): A = pd.Series([1, 2, 3], index=['A', 'C', 'B'], name=name) return A @@ -359,7 +376,7 @@ def test_impl(A): S = pd.Series(['a', 'b', 'c'], name='A') 
self.assertEqual(hpat_func(S), test_impl(S)) - def test_pass_series_all_indexes(self): + def test_pass_series_index1(self): def test_impl(A): return A hpat_func = self.jit(test_impl) @@ -370,7 +387,6 @@ def test_impl(A): list(np.arange(n)), np.arange(n), pd.RangeIndex(n), - pd.Int64Index(np.arange(n)), gen_strlist(n) ] for index in indexes_to_test: @@ -2138,26 +2154,16 @@ def test_series_value_counts_numeric_dropna_false(self): def test_impl(S): return S.value_counts(dropna=False) - data_to_test = [ - [1, 2, 3, 1, 1, 3], - [1, 2, 3, np.nan, 1, 3, np.nan, np.inf], - [0.1, 3., np.nan, 3., 0.1, 3., np.nan, np.inf, 0.1, 0.1] - ] + data_to_test = [[1, 2, 3, 1, 1, 3], + [1, 2, 3, np.nan, 1, 3, np.nan, np.inf], + [0.1, 3., np.nan, 3., 0.1, 3., np.nan, np.inf, 0.1, 0.1]] hpat_func = self.jit(test_impl) for data in data_to_test: with self.subTest(series_data=data): S = pd.Series(data) - result = hpat_func(S) - result_ref = test_impl(S) - - # order within groups of same counts may be different since - # pandas impl uses sort_values() with default kind='quicksort' - pd.testing.assert_series_equal( - result.sort_index(), - result_ref.sort_index() - ) + pd.testing.assert_series_equal(hpat_func(S), test_impl(S)) def test_series_value_counts_str_dropna_false(self): def test_impl(S): @@ -2200,15 +2206,13 @@ def test_series_value_counts_index(self): def test_impl(S): return S.value_counts() - sdc_func = self.jit(test_impl) + hpat_func = self.jit(test_impl) for data in test_global_input_data_integer64: - index = np.arange(start=1, stop=len(data) + 1) with self.subTest(series_data=data): + index = np.arange(start=1, stop=len(data) + 1) S = pd.Series(data, index=index) - result = sdc_func(S) - result_ref = test_impl(S) - pd.testing.assert_series_equal(result.sort_index(), result_ref.sort_index()) + pd.testing.assert_series_equal(hpat_func(S).sort_index(), test_impl(S).sort_index()) def test_series_value_counts_no_unboxing(self): def test_impl(): @@ -4033,17 +4037,17 @@ def 
test_impl(series, ascending, kind): for data in all_data: series = pd.Series(data * 3) - for ascending, kind in product([True, False], ['quicksort', 'mergesort']): - with self.subTest(data=data, ascending=ascending, kind=kind): - result = hpat_func(series, ascending, kind=kind) - result_ref = test_impl(series, ascending, kind=kind) + for ascending in [True, False]: + for kind in ['quicksort', 'mergesort']: + ref_result = test_impl(series, ascending, kind=kind) + jit_result = hpat_func(series, ascending, kind=kind) + ref = restore_series_sort_values(series, ref_result.index, ascending) + jit = restore_series_sort_values(series, jit_result.index, ascending) if kind == 'mergesort': - pd.testing.assert_series_equal(result, result_ref) + pd.testing.assert_series_equal(ref_result, jit_result) else: - np.testing.assert_array_equal(result.values, result_ref.values) - jit = restore_series_sort_values(series, result.index, ascending) - ref = restore_series_sort_values(series, result_ref.index, ascending) - self.assertEqual(jit, ref) + np.testing.assert_array_equal(ref_result.values, jit_result.values) + self.assertEqual(ref, jit) @skip_parallel def test_series_sort_values_full_idx(self): @@ -5326,7 +5330,6 @@ def test_impl(A, i, value): test_impl(S2, idx, value) pd.testing.assert_series_equal(S1, S2) - @unittest.expectedFailure # FIXME_Pandas#37427 (since pandas=1.1 setitem does diff things for diff dtypes) def test_series_setitem_idx_str_series(self): """ Verifies Series.setitem for idx operand of type pandas.Series and string dtype called on integer Series with index of matching dtype and scalar and non scalar assigned values """ @@ -5342,7 +5345,6 @@ def test_series_setitem_idx_str_series(self): pd.Series(assigned_values)] self._test_series_setitem([series_data], [series_index], [idx], values_to_test, np.intp) - @unittest.expectedFailure # FIXME_Pandas#37427 (since pandas=1.1 setitem does diff things for diff dtypes) def test_series_setitem_idx_float_series(self): """ 
Verifies Series.setitem for idx operand of type pandas.Series and float dtype called on integer Series with index of matching dtype and scalar and non scalar assigned values """ diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 1d489d17f..81bc81c31 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -40,17 +40,6 @@ from sdc.str_arr_type import string_array_type from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType -from sdc.str_arr_ext import StringArrayType - - -sdc_pandas_index_types = ( - types.NoneType, - types.Array, - StringArrayType, - RangeIndexType, - Int64IndexType, - ) class TypeChecker: @@ -149,7 +138,7 @@ def check_is_numeric_array(type_var): def check_index_is_numeric(ty_series): """Used during typing to check that series has numeric index""" - return isinstance(ty_series.index.dtype, types.Number) + return check_is_numeric_array(ty_series.index) def check_types_comparable(ty_left, ty_right): @@ -207,7 +196,6 @@ def find_index_common_dtype(self, other): return index_dtypes_match, numba_index_common_dtype - def gen_impl_generator(codegen, impl_name): """Generate generator of an implementation""" def _df_impl_generator(*args, **kwargs): @@ -220,7 +208,3 @@ def _df_impl_generator(*args, **kwargs): return _impl return _df_impl_generator - - -def check_signed_integer(ty): - return isinstance(ty, types.Integer) and ty.signed diff --git a/setup.py b/setup.py index 903725410..d4c8f1fab 100644 --- a/setup.py +++ b/setup.py @@ -375,9 +375,9 @@ def run(self): package_data={'sdc.tests': ['*.bz2'], }, install_requires=[ 'numpy>=1.16', - 'pandas>=1.2.0', + 'pandas>=1.0', 'pyarrow==0.17.0', - 'numba>=0.52.0,<0.53', + 'numba>=0.51.2,<0.52', 'tbb' ], cmdclass=sdc_build_commands, From 7e83e15553395a420ec5bdd55fc013cc38084f86 Mon Sep 17 00:00:00 2001 From: Alexander Rybkin <38352652+xaleryb@users.noreply.github.com> Date: Wed, 24 
Feb 2021 13:07:37 +0300 Subject: [PATCH 4/4] PyArrow version updated to 2.0.0 (#965) * Update meta.yaml * Updating more files referring to pyarrow version Co-authored-by: Kozlov, Alexey --- README.rst | 8 ++++---- conda-recipe/meta.yaml | 2 +- docs/source/getting_started.rst | 4 ++-- requirements.txt | 2 +- setup.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index a5f18b8a9..fc4e8abef 100644 --- a/README.rst +++ b/README.rst @@ -34,13 +34,13 @@ Distribution includes Intel® SDC for Python 3.6 and Python 3.7 for Windows and Intel® SDC conda package can be installed using the steps below:: - > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=2.0.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc-env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel® SDC wheel package can be installed using the steps below:: - > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=2.0.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc-env > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc @@ -82,7 +82,7 @@ Building on Linux with setuptools export PYVER=<3.6 or 3.7> export NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64 + conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=2.0.0 gcc_linux-64 gxx_linux-64 source activate sdc-env git clone 
https://github.com/IntelPython/sdc.git cd sdc @@ -120,7 +120,7 @@ Building on Windows with setuptools set PYVER=<3.6 or 3.7> set NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 + conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=2.0.0 conda activate sdc-env set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include set LIB=%LIB%;%CONDA_PREFIX%\Library\lib diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index bd95dbc9d..17189b818 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,6 +1,6 @@ {% set NUMBA_VERSION = "==0.52.0" %} {% set PANDAS_VERSION = "==1.2.0" %} -{% set PYARROW_VERSION = "==0.17.0" %} +{% set PYARROW_VERSION = "==2.0.0" %} package: name: sdc diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index b0fcc0182..2156c2214 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla Intel SDC conda package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=2.0.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc_env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel SDC wheel package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=2.0.0 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc_env > pip install --index-url 
https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc diff --git a/requirements.txt b/requirements.txt index f3016c49e..4e7e3940c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy>=1.16 pandas==1.2.0 -pyarrow==0.17.0 +pyarrow==2.0.0 numba==0.52.0 tbb tbb-devel diff --git a/setup.py b/setup.py index 903725410..7a3b60388 100644 --- a/setup.py +++ b/setup.py @@ -376,7 +376,7 @@ def run(self): install_requires=[ 'numpy>=1.16', 'pandas>=1.2.0', - 'pyarrow==0.17.0', + 'pyarrow==2.0.0', 'numba>=0.52.0,<0.53', 'tbb' ],