Merge 0c34556 into 24c08f3

KrishnaswamyLab · May 11, 2022 · e2c226c · e2c226c
2 parents 24c08f3 + 0c34556
commit e2c226c
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 19 deletions.
diff --git a/scprep/stats.py b/scprep/stats.py
@@ -1,3 +1,5 @@
+import scipy.sparse
+
 from . import plot
 from . import select
 from . import utils
@@ -50,7 +52,7 @@ def EMD(x, y):
     return stats.wasserstein_distance(x, y)
 
 
-def pairwise_correlation(X, Y):
+def pairwise_correlation(X, Y, ignore_nan=False):
     """Compute pairwise Pearson correlation between columns of two matrices.
 
     From https://stackoverflow.com/a/33651442/3996580
@@ -61,6 +63,8 @@ def pairwise_correlation(X, Y):
         Input data
     Y : array-like, shape=[n_samples, p_features]
         Input data
+    ignore_nan : bool, optional (default: False)
+        If True, ignore NaNs, computing correlation over remaining values
 
     Returns
     -------
@@ -78,20 +82,28 @@ def pairwise_correlation(X, Y):
     if sparse.issparse(Y) and not sparse.issparse(X):
         X = sparse.csr_matrix(X)
     # Store columnw-wise in X and Y, as they would be used at few places
-    X_colsums = utils.matrix_sum(X, axis=0)
-    Y_colsums = utils.matrix_sum(Y, axis=0)
+    X_colsums = utils.matrix_sum(X, axis=0, ignore_nan=ignore_nan)
+    Y_colsums = utils.matrix_sum(Y, axis=0, ignore_nan=ignore_nan)
     # Basically there are four parts in the formula. We would compute them
     # one-by-one
-    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
-    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
-    var_x = N * utils.matrix_sum(utils.matrix_transform(X, np.power, 2), axis=0) - (
-        X_colsums**2
+    X_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(X, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
-    var_y = N * utils.matrix_sum(utils.matrix_transform(Y, np.power, 2), axis=0) - (
-        Y_colsums**2
+    Y_sq_colsums = utils.matrix_sum(
+        utils.matrix_transform(Y, np.power, 2), axis=0, ignore_nan=ignore_nan
     )
+    var_x = N * X_sq_colsums - X_colsums**2
+    var_y = N * Y_sq_colsums - Y_colsums**2
+    if ignore_nan:
+        # now that we have the variance computed we can fill in the NaNs
+        X = utils.fillna(X, 0)
+        Y = utils.fillna(Y, 0)
+    N_times_sum_xy = utils.toarray(N * Y.T.dot(X))
+    sum_x_times_sum_y = X_colsums * Y_colsums[:, None]
     # Finally compute Pearson Correlation Coefficient as 2D array
-    cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="invalid value encountered in true_divide", category=RuntimeWarning)
+        cor = (N_times_sum_xy - sum_x_times_sum_y) / np.sqrt(var_x * var_y[:, None])
     return cor.T
 
 

diff --git a/scprep/utils.py b/scprep/utils.py
@@ -378,7 +378,32 @@ def matrix_transform(data, fun, *args, **kwargs):
     return data
 
 
-def matrix_sum(data, axis=None):
+def fillna(data, fill, copy=True):
+    return_cls = None
+    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
+        return_cls = type(data)
+        assert copy, f"Cannot fillna in-place for {return_cls.__name__}"
+        data = data.tocsr()
+    elif copy:
+        data = data.copy()
+    if sparse.issparse(data):
+        data.data[np.isnan(data.data)] = fill
+        if return_cls is not None:
+            data = return_cls(data)
+    else:
+        data[np.isnan(data)] = fill
+    return data
+
+
+
+def _nansum(data, axis=None):
+    if sparse.issparse(data):
+        return np.sum(fillna(data, 0), axis=axis)
+    else:
+        return np.nansum(data, axis=axis)
+
+
+def matrix_sum(data, axis=None, ignore_nan=False):
     """Get the column-wise, row-wise, or total sum of values in a matrix.
 
     Parameters
@@ -388,37 +413,40 @@ def matrix_sum(data, axis=None):
     axis : int or None, optional (default: None)
         Axis across which to sum. axis=0 gives column sums,
         axis=1 gives row sums. None gives the total sum.
+    ignore_nan : bool, optional (default: False)
+        If True, uses `np.nansum` instead of `np.sum`
 
     Returns
     -------
     sums : array-like or float
         Sums along desired axis.
     """
+    sum_fn = _nansum if ignore_nan else np.sum
     if axis not in [0, 1, None]:
         raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
     if isinstance(data, pd.DataFrame):
         if is_SparseDataFrame(data):
             if axis is None:
-                sums = data.to_coo().sum()
+                sums = sum_fn(data.to_coo())
             else:
                 index = data.index if axis == 1 else data.columns
                 sums = pd.Series(
-                    np.array(data.to_coo().sum(axis)).flatten(), index=index
+                    np.array(sum_fn(data.to_coo(), axis)).flatten(), index=index
                 )
         elif is_sparse_dataframe(data):
             if axis is None:
-                sums = data.sparse.to_coo().sum()
+                sums = sum_fn(data.sparse.to_coo())
             else:
                 index = data.index if axis == 1 else data.columns
                 sums = pd.Series(
-                    np.array(data.sparse.to_coo().sum(axis)).flatten(), index=index
+                    np.array(sum_fn(data.sparse.to_coo(), axis)).flatten(), index=index
                 )
         elif axis is None:
-            sums = data.to_numpy().sum()
+            sums = sum_fn(data.to_numpy())
         else:
-            sums = data.sum(axis)
+            sums = sum_fn(data, axis)
     else:
-        sums = np.sum(data, axis=axis)
+        sums = sum_fn(data, axis=axis)
         if isinstance(sums, np.matrix):
             sums = np.array(sums).flatten()
     return sums

diff --git a/test/test_stats.py b/test/test_stats.py
@@ -1,4 +1,6 @@
 from functools import partial
+
+import scipy.sparse
 from parameterized import parameterized
 from scipy import stats
 from tools import data
@@ -93,7 +95,7 @@ def test_fun(X, *args, **kwargs):
         )
 
     D = data.generate_positive_sparse_matrix(shape=(500, 100), seed=42, poisson_mean=5)
-    Y = test_fun(D)
+    Y = np.corrcoef(D.T)[:,:10]
     assert Y.shape == (D.shape[1], 10)
     assert np.allclose(Y[(np.arange(10), np.arange(10))], 1, atol=0)
     matrix.test_all_matrix_types(
@@ -126,6 +128,34 @@ def test_fun(X, *args, **kwargs):
     )
 
 
+def test_pairwise_correlation_nan():
+    D = np.array([np.arange(10), np.arange(0, 20, 2), np.zeros(10)]).astype(float).T
+    D[3,:] = np.nan
+
+    def test_with_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D)
+        assert np.all(np.isnan(C))
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_nan,
+    )
+
+    def test_with_ignore_nan(D):
+        C = scprep.stats.pairwise_correlation(D, D, ignore_nan=True)
+        # should still be NaN on samples that have no variance
+        assert np.all(np.isnan(C[-1]))
+        assert np.all(np.isnan(C[:,-1]))
+        # but shouldn't be NaN on samples that have some NaNs
+        assert not np.any(np.isnan(C[:2][:,:2]))
+        np.testing.assert_equal(C[:2][:,:2], 1)
+
+    matrix.test_all_matrix_types(
+        D,
+        test_with_ignore_nan,
+    )
+
+
 def shan_entropy(c):
     c_normalized = c / float(np.sum(c))
     return stats.entropy(c_normalized[c_normalized != 0])