Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 59 additions & 5 deletions sdc/datatypes/common_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,14 @@
from numba import numpy_support

import sdc
from sdc.str_arr_type import string_array_type
from sdc.str_arr_ext import (num_total_chars, append_string_array_to,
str_arr_is_na, pre_alloc_string_array, str_arr_set_na,
cp_str_list_to_array)
from sdc.hiframes.pd_series_type import SeriesType
from sdc.str_arr_ext import (
append_string_array_to, cp_str_list_to_array, num_total_chars,
pre_alloc_string_array, str_arr_is_na, str_arr_set_na, string_array_type
)
from sdc.utilities.utils import sdc_overload, sdc_register_jitable
from sdc.utilities.sdc_typing_utils import find_common_dtype_from_numpy_dtypes
from sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes,
TypeChecker)


def hpat_arrays_append(A, B):
Expand Down Expand Up @@ -537,3 +539,55 @@ def _sdc_pandas_series_check_axis_impl(axis):
return _sdc_pandas_series_check_axis_impl

return None


def _sdc_pandas_series_align(series, other, size='max', finiteness=False):
"""
Align series and other series by
size where size of output series is max/min size of input series
finiteness where all the infinite and matched finite values are replaced with nans, e.g.
series: [1., inf, inf, -1., 0.] -> [1., nan, nan, -1., 0.]
other: [1., -1., 0., 0.1, -0.1] -> [1., nan, nan, 0.1, -0.1]
"""
pass


@sdc_overload(_sdc_pandas_series_align, jit_options={'parallel': False})
def _sdc_pandas_series_align_overload(series, other, size='max', finiteness=False):
    """
    Type-check the arguments of _sdc_pandas_series_align() and return its implementation.

    Both `series` and `other` are checked against SeriesType; `size` must be a
    string type and `finiteness` a boolean type (or omitted), otherwise the
    type checker raises.
    """
    checker = TypeChecker('Function sdc.common_functions._sdc_pandas_series_align().')
    for candidate in (series, other):
        checker.check(candidate, SeriesType)

    accepted_size = (str, types.StringLiteral, types.UnicodeType, types.Omitted)
    if not isinstance(size, accepted_size):
        checker.raise_exc(size, 'str', 'size')

    accepted_finiteness = (bool, types.Boolean, types.Omitted)
    if not isinstance(finiteness, accepted_finiteness):
        checker.raise_exc(finiteness, 'bool', 'finiteness')

    def _sdc_pandas_series_align_impl(series, other, size='max', finiteness=False):
        # Only the two literal modes are supported.
        if not (size == 'max' or size == 'min'):
            raise ValueError("Function sdc.common_functions._sdc_pandas_series_align(). "
                             "The object size\n expected: 'max' or 'min'")

        left_data = series._data
        right_data = other._data
        left_len = len(left_data)
        right_len = len(right_data)

        # Values are copied only over the common prefix; the output length
        # depends on the requested mode.
        common_len = min(left_len, right_len)
        if size == 'max':
            out_len = max(left_len, right_len)
        else:
            out_len = common_len

        # Start from all-NaN buffers so every position that is not copied stays NaN.
        left_out = numpy.repeat([numpy.nan], out_len)
        right_out = numpy.repeat([numpy.nan], out_len)

        for idx in numba.prange(common_len):
            if finiteness and not (numpy.isfinite(left_data[idx]) and numpy.isfinite(right_data[idx])):
                # At least one side is not finite: blank the pair on both sides.
                left_out[idx] = numpy.nan
                right_out[idx] = numpy.nan
            else:
                left_out[idx] = left_data[idx]
                right_out[idx] = right_data[idx]

        # The results carry the names of the respective inputs.
        left_series = pandas.Series(left_out, name=series._name)
        right_series = pandas.Series(right_out, name=other._name)

        return left_series, right_series

    return _sdc_pandas_series_align_impl
118 changes: 95 additions & 23 deletions sdc/datatypes/hpat_pandas_dataframe_rolling_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def df_rolling_method_other_df_codegen(method_name, self, other, args=None, kws=
' else:',
' _pairwise = pairwise',
' if _pairwise:',
' raise ValueError("Method rolling.corr(). The object pairwise\\n expected: False, None")'
f' raise ValueError("Method rolling.{method_name}(). The object pairwise\\n expected: False, None")'
]

data_length = 'len(get_dataframe_data(self._data, 0))' if data_columns else '0'
Expand All @@ -139,7 +139,7 @@ def df_rolling_method_other_df_codegen(method_name, self, other, args=None, kws=
f' series_{col} = pandas.Series(data_{col})',
f' {other_series} = pandas.Series(other_data_{col})',
f' rolling_{col} = series_{col}.rolling({rolling_params})',
f' result_{col} = rolling_{col}.corr({method_params})',
f' result_{col} = rolling_{col}.{method_name}({method_params})',
f' {res_data} = result_{col}._data[:length]'
]
else:
Expand Down Expand Up @@ -182,32 +182,41 @@ def df_rolling_method_main_codegen(method_params, df_columns, method_name):
return func_lines


def gen_df_rolling_method_other_none_codegen(rewrite_name=None):
    """Generate df.rolling method code generator based on name of the method

    Parameters
    ----------
    rewrite_name: str, optional
        Public method name to use in the generated function's name and in its
        error message. When omitted, the `method_name` given to the returned
        code generator is used for those too.

    Returns
    -------
    callable
        ``codegen(method_name, self, args=None, kws=None)`` returning a pair
        ``(func_text, global_vars)``: the source of the implementation and the
        globals it has to be executed with.
    """
    def df_rolling_method_other_none_codegen(method_name, self, args=None, kws=None):
        """Produce the source of the `other is None` implementation of a df.rolling method"""
        # `method_name` is what gets passed on to the main codegen, while
        # `_method_name` is only used for naming and messages.
        _method_name = rewrite_name or method_name
        args = args or []
        kwargs = kws or {}

        impl_params = ['self'] + args + params2list(kwargs)
        impl_params_as_str = ', '.join(impl_params)

        impl_name = f'_df_rolling_{_method_name}_other_none_impl'
        func_lines = [f'def {impl_name}({impl_params_as_str}):']

        # `other` is known to be None here, so it is not forwarded.
        method_params = args + ['{}={}'.format(k, k) for k in kwargs if k != 'other']
        main_lines = list(df_rolling_method_main_codegen(method_params, self.data.columns, method_name))

        if 'pairwise' in kwargs:
            # Indent the pairwise guard exactly like the body lines produced by
            # the main codegen, so the generated function has one consistent
            # block indentation and nested statements sit one level deeper.
            first_line = main_lines[0] if main_lines else ''
            indent = first_line[:len(first_line) - len(first_line.lstrip())] or '    '
            nested = indent * 2
            func_lines += [
                f'{indent}if pairwise is None:',
                f'{nested}_pairwise = True',
                f'{indent}else:',
                f'{nested}_pairwise = pairwise',
                f'{indent}if _pairwise:',
                f'{nested}raise ValueError("Method rolling.{_method_name}(). The object pairwise\\n expected: False")'
            ]

        func_lines += main_lines
        func_text = '\n'.join(func_lines)

        global_vars = {'pandas': pandas, 'get_dataframe_data': get_dataframe_data}

        return func_text, global_vars

    return df_rolling_method_other_none_codegen


# Generic generator: names and messages follow the method that is being generated.
df_rolling_method_other_none_codegen = gen_df_rolling_method_other_none_codegen()
# Generator for cov: names and messages say 'cov' whatever method name is passed in.
df_rolling_cov_other_none_codegen = gen_df_rolling_method_other_none_codegen('cov')


def df_rolling_method_codegen(method_name, self, args=None, kws=None):
Expand Down Expand Up @@ -249,6 +258,16 @@ def gen_df_rolling_method_other_none_impl(method_name, self, args=None, kws=None
return _impl


def gen_df_rolling_cov_other_none_impl(method_name, self, args=None, kws=None):
    """Build the implementation of df.rolling.cov() for the case when `other` is not given

    The source text comes from df_rolling_cov_other_none_codegen and is
    executed with the globals that generator returns; the function it defines
    is then picked up by name and returned.
    """
    func_text, global_vars = df_rolling_cov_other_none_codegen(method_name, self,
                                                               args=args, kws=kws)
    loc_vars = {}
    # The text is generated by this module's own code generator.
    exec(func_text, global_vars, loc_vars)
    # Fixed name of the generated function; a plain literal because nothing is interpolated.
    _impl = loc_vars['_df_rolling_cov_other_none_impl']

    return _impl


def gen_df_rolling_method_impl(method_name, self, args=None, kws=None):
func_text, global_vars = df_rolling_method_codegen(method_name, self,
args=args, kws=kws)
Expand Down Expand Up @@ -308,6 +327,37 @@ def sdc_pandas_dataframe_rolling_count(self):
return gen_df_rolling_method_impl('count', self)


@sdc_overload_method(DataFrameRollingType, 'cov')
def sdc_pandas_dataframe_rolling_cov(self, other=None, pairwise=None, ddof=1):
    # Validate the argument types first, then pick the implementation
    # generator that matches the kind of `other`.
    checker = TypeChecker('Method rolling.cov().')
    checker.check(self, DataFrameRollingType)

    other_is_valid = other is None or isinstance(other, (Omitted, NoneType, DataFrameType, SeriesType))
    if not other_is_valid:
        checker.raise_exc(other, 'DataFrame, Series', 'other')

    pairwise_is_valid = pairwise is None or isinstance(pairwise, (bool, Boolean, Omitted, NoneType))
    if not pairwise_is_valid:
        checker.raise_exc(pairwise, 'bool', 'pairwise')

    if not isinstance(ddof, (int, Integer, Omitted)):
        checker.raise_exc(ddof, 'int', 'ddof')

    # Parameter names and the textual defaults handed to the generators.
    default_kws = {'other': 'None', 'pairwise': 'None', 'ddof': '1'}

    if other is None or isinstance(other, (Omitted, NoneType)):
        # method _df_cov in comparison to method cov doesn't align input data
        # by replacing infinite and matched finite values with nans
        return gen_df_rolling_cov_other_none_impl('_df_cov', self, kws=default_kws)

    if isinstance(other, DataFrameType):
        return gen_df_rolling_method_other_df_impl('cov', self, other, kws=default_kws)

    # Remaining accepted case: `other` is a Series.
    return gen_df_rolling_method_impl('cov', self, kws=default_kws)


@sdc_overload_method(DataFrameRollingType, 'kurt')
def sdc_pandas_dataframe_rolling_kurt(self):

Expand Down Expand Up @@ -457,6 +507,28 @@ def sdc_pandas_dataframe_rolling_var(self, ddof=1):
'extra_params': ''
})

# Documentation of DataFrame.rolling.cov() is produced by filling the shared
# rolling docstring template; the keys below are the values substituted into it.
sdc_pandas_dataframe_rolling_cov.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
'method_name': 'cov',
'example_caption': 'Calculate rolling covariance.',
# Known differences from Pandas that are shown to the user.
'limitations_block':
"""
Limitations
-----------
DataFrame elements cannot be max/min float/integer. Otherwise SDC and Pandas results are different.
Different size of `self` and `other` can produce result different from the result of Pandas
due to different float rounding in Python and SDC.
""",
# Descriptions of the parameters specific to cov().
'extra_params':
"""
other: :obj:`Series` or :obj:`DataFrame`
Other Series or DataFrame.
pairwise: :obj:`bool`
Calculate pairwise combinations of columns within a DataFrame.
ddof: :obj:`int`
Delta Degrees of Freedom.
"""
})

sdc_pandas_dataframe_rolling_kurt.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
'method_name': 'kurt',
'example_caption': 'Calculate unbiased rolling kurtosis.',
Expand Down
82 changes: 36 additions & 46 deletions sdc/datatypes/hpat_pandas_series_rolling_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@
from numba.types import (float64, Boolean, Integer, NoneType, Number,
Omitted, StringLiteral, UnicodeType)

from sdc.utilities.sdc_typing_utils import TypeChecker
from sdc.datatypes.common_functions import _sdc_pandas_series_align
from sdc.datatypes.hpat_pandas_series_rolling_types import SeriesRollingType
from sdc.hiframes.pd_series_type import SeriesType
from sdc.utilities.sdc_typing_utils import TypeChecker
from sdc.utilities.utils import sdc_overload_method, sdc_register_jitable


Expand Down Expand Up @@ -111,15 +113,6 @@ def arr_nonnan_count(arr):
return len(arr) - numpy.isnan(arr).sum()


@sdc_register_jitable
def arr_cov(x, y, ddof):
    """Return covariance of 1D arrays x and y of the same size, NaN if x is empty"""
    if len(x) > 0:
        # numpy.cov gives the 2x2 covariance matrix of the pair;
        # the off-diagonal element is cov(x, y).
        cov_matrix = numpy.cov(x, y, ddof=ddof)
        return cov_matrix[0, 1]

    return numpy.nan


@sdc_register_jitable
def _moment(arr, moment):
mn = numpy.mean(arr)
Expand Down Expand Up @@ -451,16 +444,15 @@ def hpat_pandas_rolling_series_count_impl(self):
return hpat_pandas_rolling_series_count_impl


@sdc_rolling_overload(SeriesRollingType, 'cov')
def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):

def _hpat_pandas_series_rolling_cov_check_types(self, other=None,
pairwise=None, ddof=1):
"""Check types of parameters of series.rolling.cov()"""
ty_checker = TypeChecker('Method rolling.cov().')
ty_checker.check(self, SeriesRollingType)

# TODO: check `other` is Series after a circular import of SeriesType fixed
# accepted_other = (bool, Omitted, NoneType, SeriesType)
# if not isinstance(other, accepted_other) and other is not None:
# ty_checker.raise_exc(other, 'Series', 'other')
accepted_other = (bool, Omitted, NoneType, SeriesType)
if not isinstance(other, accepted_other) and other is not None:
ty_checker.raise_exc(other, 'Series', 'other')

accepted_pairwise = (bool, Boolean, Omitted, NoneType)
if not isinstance(pairwise, accepted_pairwise) and pairwise is not None:
Expand All @@ -469,50 +461,48 @@ def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
if not isinstance(ddof, (int, Integer, Omitted)):
ty_checker.raise_exc(ddof, 'int', 'ddof')


def _gen_hpat_pandas_rolling_series_cov_impl(other, align_finiteness=False):
    """Generate series.rolling.cov() implementation based on series alignment

    Parameters
    ----------
    other
        Type of the `other` argument seen at typing time. It only decides
        whether the implementation uses the rolled series itself as `other`.
    align_finiteness: bool
        Passed to _sdc_pandas_series_align() as `finiteness`.

    Returns
    -------
    callable
        ``_impl(self, other=None, pairwise=None, ddof=1)`` computing the rolling covariance.
    """
    nan_other = isinstance(other, (Omitted, NoneType)) or other is None

    def _impl(self, other=None, pairwise=None, ddof=1):
        win = self._window
        minp = self._min_periods

        main_series = self._data
        # `nan_other` is a constant captured from the enclosing generator.
        if nan_other == True:  # noqa
            other_series = main_series
        else:
            other_series = other

        main_aligned, other_aligned = _sdc_pandas_series_align(main_series, other_series,
                                                               finiteness=align_finiteness)
        # Rolling count over the elementwise sum of the aligned series
        # (presumably counting only non-NaN pairs - verify against rolling.count()).
        count = (main_aligned + other_aligned).rolling(win).count()
        # Correction factor count / (count - ddof) applied to the raw covariance.
        bias_adj = count / (count - ddof)

        def mean(series):
            return series.rolling(win, min_periods=minp).mean()

        # cov(x, y) = (mean(x * y) - mean(x) * mean(y)) * bias_adj
        return (mean(main_aligned * other_aligned) - mean(main_aligned) * mean(other_aligned)) * bias_adj

    return _impl


@sdc_rolling_overload(SeriesRollingType, 'cov')
def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
    # Reject unsupported argument types before building the implementation.
    _hpat_pandas_series_rolling_cov_check_types(self, other=other, pairwise=pairwise, ddof=ddof)

    # Public rolling.cov(): the input series are aligned by finiteness too.
    cov_impl = _gen_hpat_pandas_rolling_series_cov_impl(other, align_finiteness=True)
    return cov_impl


@sdc_rolling_overload(SeriesRollingType, '_df_cov')
def hpat_pandas_series_rolling_df_cov(self, other=None, pairwise=None, ddof=1):
    # Variant of rolling.cov() registered under the attribute name '_df_cov';
    # DataFrame.rolling.cov() requests it when `other` is not given.
    # It differs from 'cov' only in leaving `align_finiteness` at its default (False).
    #
    # The Python-level name is distinct from the 'cov' overload on purpose, so
    # this definition does not rebind `hpat_pandas_series_rolling_cov`.
    _hpat_pandas_series_rolling_cov_check_types(self, other=other,
                                                pairwise=pairwise, ddof=ddof)

    return _gen_hpat_pandas_rolling_series_cov_impl(other)


@sdc_rolling_overload(SeriesRollingType, 'kurt')
Expand Down
Loading