From 0c34556169a54f404f62f5648dca01bc9438c841 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 11 May 2022 12:14:27 -0400
Subject: [PATCH] fix tests

---
 scprep/stats.py    | 32 +++++++++++++++++---------------
 scprep/utils.py    | 27 ++++++++++++++++++++++++++-
 test/test_stats.py | 36 ++++++++++++++++++++++++++++--------
 3 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/scprep/stats.py b/scprep/stats.py
index a35ebae7..45807d5e 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -1,3 +1,5 @@
+import scipy.sparse
+
 from . import plot
 from . import select
 from . import utils
@@ -84,24 +86,24 @@ def pairwise_correlation(X, Y, ignore_nan=False):
     Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan)
     # Basically there are four parts in the formula. We would compute them
     # one-by-one
-    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
-    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = (
-        N
-        * utils.matrix_sum(
-            utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
-        )
-        - X_colsums**2
+    X_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
-    var_y = (
-        N
-        * utils.matrix_sum(
-            utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
-        )
-        - Y_colsums**2
+    Y_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
+    var_x = N * X_sq_colsums - X_colsums**2
+    var_y = N * Y_sq_colsums - Y_colsums**2
+    if ignore_nan:
+        # now that we have the variance computed we can fill in the NaNs
+        X = utils.fillna(X, 0)
+        Y = utils.fillna(Y, 0)
+    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
+    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
     # Finally compute Pearson Correlation Coefficient as 2D array
-    cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning)
+        cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
 
 
diff --git a/scprep/utils.py b/scprep/utils.py
index 77e89630..c9dd4b4b 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -378,6 +378,31 @@ def matrix_transform(data, fun, *args, **kwargs):
     return data
 
 
+def fillna(data, fill, copy=True):
+    return_cls = None
+    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
+        return_cls = type(data)
+        assert copy, f"Cannot fillna in-place for {return_cls.__name__}"
+        data = data.tocsr()
+    elif copy:
+        data = data.copy()
+    if sparse.issparse(data):
+        data.data[np.isnan(data.data)] = fill
+        if return_cls is not None:
+            data = return_cls(data)
+    else:
+        data[np.isnan(data)] = fill
+    return data
+
+
+
+def _nansum(data, axis=None):
+    if sparse.issparse(data):
+        return np.sum(fillna(data, 0), axis=axis)
+    else:
+        return np.nansum(data, axis=axis)
+
+
 def matrix_sum(data, axis=None, ignore_nan=False):
     """Get the column-wise, row-wise, or total sum of values in a matrix.
 
@@ -396,7 +421,7 @@ def matrix_sum(data, axis=None, ignore_nan=False):
     sums : array-like or float
         Sums along desired axis.
     """
-    sum_fn = np.nansum if ignore_nan else np.sum
+    sum_fn = _nansum if ignore_nan else np.sum
     if axis not in [0, 1, None]:
         raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
     if isinstance(data, pd.DataFrame):
diff --git a/test/test_stats.py b/test/test_stats.py
index 5c6d0347..6b6ef4af 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -1,4 +1,6 @@
 from functools import partial
+
+import scipy.sparse
 from parameterized import parameterized
 from scipy import stats
 from tools import data
@@ -93,7 +95,7 @@ def test_fun(X, *args, **kwargs):
         )
 
     D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5)
-    Y = test_fun(D)
+    Y = np.corrcoef(D.T)[:,:10]
     assert Y.shape == (D.shape[1], 10)
     assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0)
     matrix.test_all_matrix_types(
@@ -127,13 +129,31 @@ def test_fun(X, *args, **kwargs):
 
 
 def test_pairwise_correlation_nan():
-    D = np.array([np.arange(10), np.arange(0, 20, 2)]).astype(float)
-    D[:, 3] = np.nan
-    C = scprep.stats.pairwise_correlation(D, D)
-    assert np.all(np.isnan(C))
-    C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
-    assert not np.any(np.isnan(C))
-    np.testing.assert_equal(C, 1)
+    D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T
+    D[3,:] = np.nan
+
+    def test_with_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D)
+        assert np.all(np.isnan(C))
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_nan,
+    )
+
+    def test_with_ignore_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
+        # should still be NaN on samples that have no variance
+        assert np.all(np.isnan(C[-1]))
+        assert np.all(np.isnan(C[:,-1]))
+        # but shouldn't be NaN on samples that have some NaNs
+        assert not np.any(np.isnan(C[:2][:,:2]))
+        np.testing.assert_equal(C[:2][:,:2], 1)
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_ignore_nan,
+    )
 
 
 def shan_entropy(c):