From 0c34556169a54f404f62f5648dca01bc9438c841 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 11 May 2022 12:14:27 -0400
Subject: [PATCH] fix tests

---
 scprep/stats.py    | 32 +++++++++++++++++---------------
 scprep/utils.py    | 27 ++++++++++++++++++++++++++-
 test/test_stats.py | 36 ++++++++++++++++++++++++++++--------
 3 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/scprep/stats.py b/scprep/stats.py
index a35ebae7..45807d5e 100644
--- a/scprep/stats.py
+++ b/scprep/stats.py
@@ -1,3 +1,5 @@
+import scipy.sparse
+
 from . import plot
 from . import select
 from . import utils
@@ -84,24 +86,24 @@ def pairwise_correlation(X, Y, ignore_nan=False):
     Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan)
     # Basically there are four parts in the formula. We would compute them
     # one-by-one
-    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
-    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = (
-        N
-        * utils.matrix_sum(
-            utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
-        )
-        - X_colsums**2
+    X_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
-    var_y = (
-        N
-        * utils.matrix_sum(
-            utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
-        )
-        - Y_colsums**2
+    Y_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
+    var_x = N * X_sq_colsums - X_colsums**2
+    var_y = N * Y_sq_colsums - Y_colsums**2
+    if ignore_nan:
+        # now that we have the variance computed we can fill in the NaNs
+        X = utils.fillna(X, 0)
+        Y = utils.fillna(Y, 0)
+    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
+    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
     # Finally compute Pearson Correlation Coefficient as 2D array
-    cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning)
+        cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
 
 
diff --git a/scprep/utils.py b/scprep/utils.py
index 77e89630..c9dd4b4b 100644
--- a/scprep/utils.py
+++ b/scprep/utils.py
@@ -378,6 +378,31 @@ def matrix_transform(data, fun, *args, **kwargs):
     return data
 
 
+def fillna(data, fill, copy=True):
+    return_cls = None
+    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
+        return_cls = type(data)
+        assert copy, f"Cannot fillna in-place for {return_cls.__name__}"
+        data = data.tocsr()
+    elif copy:
+        data = data.copy()
+    if sparse.issparse(data):
+        data.data[np.isnan(data.data)] = fill
+        if return_cls is not None:
+            data = return_cls(data)
+    else:
+        data[np.isnan(data)] = fill
+    return data
+
+
+
+def _nansum(data, axis=None):
+    if sparse.issparse(data):
+        return np.sum(fillna(data, 0), axis=axis)
+    else:
+        return np.nansum(data, axis=axis)
+
+
 def matrix_sum(data, axis=None, ignore_nan=False):
     """Get the column-wise, row-wise, or total sum of values in a matrix.
 
@@ -396,7 +421,7 @@ def matrix_sum(data, axis=None, ignore_nan=False):
     sums : array-like or float
         Sums along desired axis.
     """
-    sum_fn = np.nansum if ignore_nan else np.sum
+    sum_fn = _nansum if ignore_nan else np.sum
     if axis not in [0, 1, None]:
         raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
     if isinstance(data, pd.DataFrame):
diff --git a/test/test_stats.py b/test/test_stats.py
index 5c6d0347..6b6ef4af 100644
--- a/test/test_stats.py
+++ b/test/test_stats.py
@@ -1,4 +1,6 @@
 from functools import partial
+
+import scipy.sparse
 from parameterized import parameterized
 from scipy import stats
 from tools import data
@@ -93,7 +95,7 @@ def test_fun(X, *args, **kwargs):
         )
 
     D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5)
-    Y = test_fun(D)
+    Y = np.corrcoef(D.T)[:,:10]
     assert Y.shape == (D.shape[1], 10)
     assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0)
     matrix.test_all_matrix_types(
@@ -127,13 +129,31 @@ def test_fun(X, *args, **kwargs):
 
 
 def test_pairwise_correlation_nan():
-    D = np.array([np.arange(10), np.arange(0, 20, 2)]).astype(float)
-    D[:, 3] = np.nan
-    C = scprep.stats.pairwise_correlation(D, D)
-    assert np.all(np.isnan(C))
-    C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
-    assert not np.any(np.isnan(C))
-    np.testing.assert_equal(C, 1)
+    D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T
+    D[3,:] = np.nan
+
+    def test_with_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D)
+        assert np.all(np.isnan(C))
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_nan,
+    )
+
+    def test_with_ignore_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
+        # should still be NaN on samples that have no variance
+        assert np.all(np.isnan(C[-1]))
+        assert np.all(np.isnan(C[:,-1]))
+        # but shouldn't be NaN on samples that have some NaNs
+        assert not np.any(np.isnan(C[:2][:,:2]))
+        np.testing.assert_equal(C[:2][:,:2], 1)
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_ignore_nan,
+    )
 
 
 def shan_entropy(c):