This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
conda-recipe/meta.yaml (2 changes: 1 addition, 1 deletion)
@@ -1,4 +1,4 @@
{% set NUMBA_VERSION = "==0.52.0" %}
{% set NUMBA_VERSION = "==0.53.1" %}
{% set PANDAS_VERSION = "==1.2.0" %}
{% set PYARROW_VERSION = "==2.0.0" %}

requirements.txt (2 changes: 1 addition, 1 deletion)
@@ -1,6 +1,6 @@
numpy>=1.16
pandas==1.2.0
pyarrow==2.0.0
numba==0.52.0
numba==0.53.1
tbb
tbb-devel
sdc/__init__.py (2 changes: 2 additions, 0 deletions)
@@ -50,6 +50,8 @@
import sdc.extensions.indexes.range_index_ext
import sdc.extensions.indexes.int64_index_ext

import sdc.extensions.sdc_hashmap_ext

from ._version import get_versions

"""
sdc/datatypes/hpat_pandas_series_functions.py (92 changes: 53 additions, 39 deletions)
@@ -459,13 +459,15 @@ def _series_getitem_idx_bool_indexer_impl(self, idx):
if (isinstance(idx, SeriesType) and index_is_positional
and not isinstance(idx.data.dtype, (types.Boolean, bool))):
def hpat_pandas_series_getitem_idx_list_impl(self, idx):
res = numpy.copy(self._data[:len(idx._data)])
index = numpy.arange(len(self._data))
idx_data = idx._data
self_data = self._data
res = numpy.copy(self._data[:len(idx_data)])
index = numpy.arange(len(self_data))
for i in numba.prange(len(res)):
for j in numba.prange(len(index)):
if j == idx._data[i]:
res[i] = self._data[j]
return pandas.Series(data=res, index=index[idx._data], name=self._name)
if j == idx_data[i]:
res[i] = self_data[j]
return pandas.Series(data=res, index=index[idx_data], name=self._name)
return hpat_pandas_series_getitem_idx_list_impl

# idx is Series and its index is not PositionalIndex, idx.dtype is not Boolean
@@ -647,6 +649,7 @@ def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value):

def sdc_pandas_series_setitem_idx_bool_array_align_impl(self, idx, value):

series_data = self._data # FIXME_Numba#6960
# if idx is a Boolean array (and value is a series) it's used as a mask for self.index
# and filtered indexes are looked up in value.index; if found, the corresponding value is set
if value_is_series == True: # noqa
@@ -659,7 +662,7 @@ def sdc_pandas_series_setitem_idx_bool_array_align_impl(self, idx, value):
self_index_has_duplicates = len(unique_self_indices) != len(self_index)
value_index_has_duplicates = len(unique_value_indices) != len(value_index)
if (self_index_has_duplicates or value_index_has_duplicates):
self._data[idx] = value._data
series_data[idx] = value._data
else:
map_index_to_position = Dict.empty(
key_type=indexes_common_dtype,
@@ -674,13 +677,13 @@ def sdc_pandas_series_setitem_idx_bool_array_align_impl(self, idx, value):
if idx[i]:
self_index_value = self_index[i]
if self_index_value in map_index_to_position:
self._data[i] = value._data[map_index_to_position[self_index_value]]
series_data[i] = value._data[map_index_to_position[self_index_value]]
else:
sdc.hiframes.join.setitem_arr_nan(self._data, i)
sdc.hiframes.join.setitem_arr_nan(series_data, i)

else:
# if value has no index - nothing to reindex and assignment is made along positions set by idx mask
self._data[idx] = value
series_data[idx] = value

return self

@@ -755,31 +758,35 @@ def sdc_pandas_series_setitem_idx_bool_series_align_impl(self, idx, value):
value_is_scalar = not (value_is_series or value_is_array)
def sdc_pandas_series_setitem_idx_int_series_align_impl(self, idx, value):

# FIXME_Numba#6960: all changes of this commit are unnecessary - revert when resolved
self_data = self._data
self_index = self._index
self_index_size = len(self_index)
idx_size = len(idx)

_idx = idx._data if idx_is_series == True else idx # noqa
_value = value._data if value_is_series == True else value # noqa

self_index_size = len(self._index)
idx_size = len(_idx)
valid_indices = numpy.repeat(-1, self_index_size)
for i in numba.prange(self_index_size):
for j in numpy.arange(idx_size):
if self._index[i] == _idx[j]:
if self_index[i] == _idx[j]:
valid_indices[i] = j

valid_indices_positions = numpy.arange(self_index_size)[valid_indices != -1]
valid_indices_masked = valid_indices[valid_indices != -1]

indexes_found = self._index[valid_indices_positions]
indexes_found = self_index[valid_indices_positions]
if len(numpy.unique(indexes_found)) != len(indexes_found):
raise ValueError("Reindexing only valid with uniquely valued Index objects")

if len(valid_indices_masked) != idx_size:
raise KeyError("Reindexing not possible: idx has index not found in Series")

if value_is_scalar == True: # noqa
self._data[valid_indices_positions] = _value
self_data[valid_indices_positions] = _value
else:
self._data[valid_indices_positions] = numpy.take(_value, valid_indices_masked)
self_data[valid_indices_positions] = numpy.take(_value, valid_indices_masked)

return self

@@ -1598,17 +1605,18 @@ def hpat_pandas_series_var_impl(self, axis=None, skipna=None, level=None, ddof=1
if skipna is None:
skipna = True

self_data = self._data # FIXME_Numba#6960
if skipna:
valuable_length = len(self._data) - numpy.sum(numpy.isnan(self._data))
valuable_length = len(self_data) - numpy.sum(numpy.isnan(self_data))
if valuable_length <= ddof:
return numpy.nan

return numpy_like.nanvar(self._data) * valuable_length / (valuable_length - ddof)
return numpy_like.nanvar(self_data) * valuable_length / (valuable_length - ddof)

if len(self._data) <= ddof:
if len(self_data) <= ddof:
return numpy.nan

return self._data.var() * len(self._data) / (len(self._data) - ddof)
return self_data.var() * len(self_data) / (len(self_data) - ddof)

return hpat_pandas_series_var_impl

@@ -2859,8 +2867,9 @@ def hpat_pandas_series_prod_impl(self, axis=None, skipna=None, level=None, numer
else:
_skipna = skipna

series_data = self._data # FIXME_Numba#6960
if _skipna:
return numpy_like.nanprod(self._data)
return numpy_like.nanprod(series_data)
else:
return numpy.prod(self._data)

@@ -3079,8 +3088,9 @@ def hpat_pandas_series_min_impl(self, axis=None, skipna=None, level=None, numeri
else:
_skipna = skipna

series_data = self._data # FIXME_Numba#6960
if _skipna:
return numpy_like.nanmin(self._data)
return numpy_like.nanmin(series_data)

return self._data.min()

@@ -3156,8 +3166,9 @@ def hpat_pandas_series_max_impl(self, axis=None, skipna=None, level=None, numeri
else:
_skipna = skipna

series_data = self._data # FIXME_Numba#6960
if _skipna:
return numpy_like.nanmax(self._data)
return numpy_like.nanmax(series_data)

return self._data.max()

@@ -3222,8 +3233,9 @@ def hpat_pandas_series_mean_impl(self, axis=None, skipna=None, level=None, numer
else:
_skipna = skipna

series_data = self._data # FIXME_Numba#6960
if _skipna:
return numpy_like.nanmean(self._data)
return numpy_like.nanmean(series_data)

return self._data.mean()

@@ -3780,54 +3792,56 @@ def hpat_pandas_series_argsort(self, axis=0, kind='quicksort', order=None):

if not isinstance(self.index, PositionalIndexType):
def hpat_pandas_series_argsort_idx_impl(self, axis=0, kind='quicksort', order=None):
series_data = self._data # FIXME_Numba#6960
if kind != 'quicksort' and kind != 'mergesort':
raise ValueError("Method argsort(). Unsupported parameter. Given 'kind' != 'quicksort' or 'mergesort'")
if kind == 'mergesort':
#It is impossible to use numpy.argsort(self._data, kind=kind) since numba gives typing error
sort = numpy_like.argsort(self._data, kind='mergesort')
sort = numpy_like.argsort(series_data, kind='mergesort')
else:
sort = numpy_like.argsort(self._data)
sort = numpy_like.argsort(series_data)
na = self.isna().sum()
result = numpy.empty(len(self._data), dtype=numpy.int64)
na_data_arr = sdc.hiframes.api.get_nan_mask(self._data)
result = numpy.empty(len(series_data), dtype=numpy.int64)
na_data_arr = sdc.hiframes.api.get_nan_mask(series_data)
if kind == 'mergesort':
sort_nona = numpy_like.argsort(self._data[~na_data_arr], kind='mergesort')
sort_nona = numpy_like.argsort(series_data[~na_data_arr], kind='mergesort')
else:
sort_nona = numpy_like.argsort(self._data[~na_data_arr])
sort_nona = numpy_like.argsort(series_data[~na_data_arr])
q = 0
for id, i in enumerate(sort):
if id in set(sort[len(self._data) - na:]):
if id in set(sort[len(series_data) - na:]):
q += 1
else:
result[id] = sort_nona[id - q]
for i in sort[len(self._data) - na:]:
for i in sort[len(series_data) - na:]:
result[i] = -1

return pandas.Series(result, self._index)

return hpat_pandas_series_argsort_idx_impl

def hpat_pandas_series_argsort_noidx_impl(self, axis=0, kind='quicksort', order=None):
series_data = self._data # FIXME_Numba#6960
if kind != 'quicksort' and kind != 'mergesort':
raise ValueError("Method argsort(). Unsupported parameter. Given 'kind' != 'quicksort' or 'mergesort'")
if kind == 'mergesort':
sort = numpy_like.argsort(self._data, kind='mergesort')
sort = numpy_like.argsort(series_data, kind='mergesort')
else:
sort = numpy_like.argsort(self._data)
sort = numpy_like.argsort(series_data)
na = self.isna().sum()
result = numpy.empty(len(self._data), dtype=numpy.int64)
na_data_arr = sdc.hiframes.api.get_nan_mask(self._data)
result = numpy.empty(len(series_data), dtype=numpy.int64)
na_data_arr = sdc.hiframes.api.get_nan_mask(series_data)
if kind == 'mergesort':
sort_nona = numpy_like.argsort(self._data[~na_data_arr], kind='mergesort')
sort_nona = numpy_like.argsort(series_data[~na_data_arr], kind='mergesort')
else:
sort_nona = numpy_like.argsort(self._data[~na_data_arr])
sort_nona = numpy_like.argsort(series_data[~na_data_arr])
q = 0
for id, i in enumerate(sort):
if id in set(sort[len(self._data) - na:]):
if id in set(sort[len(series_data) - na:]):
q += 1
else:
result[id] = sort_nona[id - q]
for i in sort[len(self._data) - na:]:
for i in sort[len(series_data) - na:]:
result[i] = -1

return pandas.Series(result)
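
Note on the recurring change in the file above: the lines marked FIXME_Numba#6960 hoist an attribute read such as self._data or self._index into a local variable once, and the rest of the implementation then works only with that local. Below is a minimal, standalone sketch of that hoisting pattern, not code from this PR; the DummySeries jitclass and its field name are hypothetical stand-ins for SDC's series model, which applies the same refactoring to self._data and self._index inside its overload implementations.

# Illustrative sketch only: read the attribute into a local before computing,
# mirroring the `series_data = self._data  # FIXME_Numba#6960` lines added above.
import numpy as np
from numba import float64
from numba.experimental import jitclass


@jitclass([("data", float64[:])])
class DummySeries:  # hypothetical stand-in, not SDC's actual series model
    def __init__(self, data):
        self.data = data

    def biased_var(self):
        series_data = self.data  # hoisted once; the loop below uses only the local
        mean = series_data.mean()
        acc = 0.0
        for i in range(len(series_data)):
            acc += (series_data[i] - mean) ** 2
        return acc / len(series_data)


s = DummySeries(np.arange(10.0))
print(s.biased_var())  # 8.25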