Skip to content

Commit

Permalink
PERF: nancorr_spearman fastpath (pandas-dev#41885)
Browse files Browse the repository at this point in the history
  • Loading branch information
mzeitlin11 authored and JulianWgs committed Jul 3, 2021
1 parent f3263b0 commit 0c086fe
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 37 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ Performance improvements
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad"`` or ``method="backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`, :issue:`41885`)
- Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
- Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.RollingGroupby.cov` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)
Expand Down
95 changes: 59 additions & 36 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -387,15 +387,23 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
float64_t[::1] maskedx, maskedy
ndarray[uint8_t, ndim=2] mask
int64_t nobs = 0
bint no_nans
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
const int64_t[:] labels_n, labels_nobs

N, K = (<object>mat).shape
# For compatibility when calling rank_1d
labels_n = np.zeros(N, dtype=np.int64)

# Handle the edge case where we know all results will be nan
# to keep conditional logic inside loop simpler
if N < minp:
result = np.full((K, K), np.nan, dtype=np.float64)
return result

result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat).view(np.uint8)
no_nans = mask.all()

ranked_mat = np.empty((N, K), dtype=np.float64)

Expand All @@ -409,51 +417,66 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
with nogil:
for xi in range(K):
for yi in range(xi + 1):
nobs = 0
# Keep track of whether we need to recompute ranks
all_ranks = True
for i in range(N):
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
if mask[i, xi] and mask[i, yi]:
maskedx[nobs] = ranked_mat[i, xi]
maskedy[nobs] = ranked_mat[i, yi]
nobs += 1

if nobs < minp:
result[xi, yi] = result[yi, xi] = NaN
else:
if not all_ranks:
with gil:
# We need to slice back to nobs because rank_1d will
# require arrays of nobs length
labels_nobs = np.zeros(nobs, dtype=np.int64)
rankedx = rank_1d(np.array(maskedx)[:nobs],
labels=labels_nobs)
rankedy = rank_1d(np.array(maskedy)[:nobs],
labels=labels_nobs)
for i in range(nobs):
maskedx[i] = rankedx[i]
maskedy[i] = rankedy[i]
sumx = sumxx = sumyy = 0

mean = (nobs + 1) / 2.
# Fastpath for data with no nans/infs, allows avoiding mask checks
# and array reassignments
if no_nans:
mean = (N + 1) / 2.

# now the cov numerator
sumx = sumxx = sumyy = 0

for i in range(nobs):
vx = maskedx[i] - mean
vy = maskedy[i] - mean
for i in range(N):
vx = ranked_mat[i, xi] - mean
vy = ranked_mat[i, yi] - mean

sumx += vx * vy
sumxx += vx * vx
sumyy += vy * vy
else:
nobs = 0
# Keep track of whether we need to recompute ranks
all_ranks = True
for i in range(N):
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
if mask[i, xi] and mask[i, yi]:
maskedx[nobs] = ranked_mat[i, xi]
maskedy[nobs] = ranked_mat[i, yi]
nobs += 1

if nobs < minp:
result[xi, yi] = result[yi, xi] = NaN
continue
else:
if not all_ranks:
with gil:
# We need to slice back to nobs because rank_1d will
# require arrays of nobs length
labels_nobs = np.zeros(nobs, dtype=np.int64)
rankedx = rank_1d(np.array(maskedx)[:nobs],
labels=labels_nobs)
rankedy = rank_1d(np.array(maskedy)[:nobs],
labels=labels_nobs)
for i in range(nobs):
maskedx[i] = rankedx[i]
maskedy[i] = rankedy[i]

mean = (nobs + 1) / 2.

# now the cov numerator
for i in range(nobs):
vx = maskedx[i] - mean
vy = maskedy[i] - mean

divisor = sqrt(sumxx * sumyy)
sumx += vx * vy
sumxx += vx * vx
sumyy += vy * vy

if divisor != 0:
result[xi, yi] = result[yi, xi] = sumx / divisor
else:
result[xi, yi] = result[yi, xi] = NaN
divisor = sqrt(sumxx * sumyy)

if divisor != 0:
result[xi, yi] = result[yi, xi] = sumx / divisor
else:
result[xi, yi] = result[yi, xi] = NaN

return result

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,16 @@ def test_calc_corr_small_numbers(self):
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
tm.assert_frame_equal(result, expected)

@td.skip_if_no_scipy
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
def test_corr_min_periods_greater_than_length(self, method):
df = DataFrame({"A": [1, 2], "B": [1, 2]})
result = df.corr(method=method, min_periods=3)
expected = DataFrame(
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
)
tm.assert_frame_equal(result, expected)


class TestDataFrameCorrWith:
def test_corrwith(self, datetime_frame):
Expand Down

0 comments on commit 0c086fe

Please sign in to comment.