Merge 0be7b6c into a6aad79

KrishnaswamyLab · Sep 15, 2019 · 07b8c8d · 07b8c8d
2 parents a6aad79 + 0be7b6c
commit 07b8c8d
Show file tree

Hide file tree

Showing 4 changed files with 308 additions and 23 deletions.
diff --git a/scprep/stats.py b/scprep/stats.py
@@ -3,9 +3,11 @@
 
 import numbers
 import numpy as np
+import pandas as pd
 from scipy import stats, sparse
 from sklearn import neighbors, metrics
-from . import plot, utils
+import joblib
+from . import plot, utils, select
 import warnings
 
 from ._lazyload import matplotlib
@@ -363,6 +365,154 @@ def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh,
     plot.utils.show(fig)
 
 
+def mean_difference(X, Y):
+    """Calculate the mean difference in genes between two datasets
+
+    In the case where the data has been log normalized,
+    this is equivalent to fold change.
+
+    Parameters
+    ----------
+    X : array-like, shape=[n_cells, n_genes]
+    Y : array-like, shape=[m_cells, n_genes]
+
+    Returns
+    -------
+    difference : list-like, shape=[n_genes]
+    """
+    if not X.shape[1] == Y.shape[1]:
+        raise ValueError("Expected X and Y to have the same number of columns. "
+                         "Got shapes {}, {}".format(X.shape, Y.shape))
+    X = utils.to_array_or_spmatrix(X)
+    Y = utils.to_array_or_spmatrix(Y)
+    X = utils.toarray(X.mean(axis=0)).flatten()
+    Y = utils.toarray(Y.mean(axis=0)).flatten()
+    return X - Y
+
+
+def differential_expression(X, Y,
+                            measure='difference',
+                            direction='up',
+                            gene_names=None,
+                            n_jobs=-2):
+    """Calculate the most significant genes between two datasets
+
+    Parameters
+    ----------
+    X : array-like, shape=[n_cells, n_genes]
+    Y : array-like, shape=[m_cells, n_genes]
+    measure : {'difference', 'emd'}, optional (default: 'difference')
+        The measurement to be used to rank genes
+    direction : {'up', 'down', 'both'}, optional (default: 'up')
+        The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value.
+    gene_names : list-like or `None`, optional (default: `None`)
+        List of gene names associated with the columns of X and Y
+    n_jobs : int, optional (default: -2)
+        Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores.
+
+    Returns
+    -------
+    result : pd.DataFrame
+        Ordered DataFrame with a column "gene" and a column named `measure`.
+    """
+    if not direction in ['up', 'down', 'both']:
+        raise ValueError("Expected `direction` in ['up', 'down', 'both']. "
+                         "Got {}".format(direction))
+    if not measure in ['difference', 'emd']:
+        raise ValueError("Expected `measure` in ['difference', 'emd']. "
+                         "Got {}".format(measure))
+    if not (len(X.shape) == 2 and len(Y.shape) == 2):
+        raise ValueError("Expected `X` and `Y` to be matrices. "
+                         "Got shapes {}, {}".format(X.shape, Y.shape))
+    [X, Y] = utils.check_consistent_columns([X, Y])
+    if gene_names is not None:
+        if isinstance(X, pd.DataFrame):
+            X = select.select_cols(X, idx=gene_names)
+            gene_names = X.columns
+        if isinstance(Y, pd.DataFrame):
+            Y = select.select_cols(Y, idx=gene_names)
+            gene_names = Y.columns
+        if not len(gene_names) == X.shape[1]:
+            raise ValueError("Expected gene_names to have length {}. "
+                             "Got {}".format(X.shape[1], len(gene_names)))
+    else:
+        if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame):
+            gene_names = X.columns
+        else:
+            gene_names = np.arange(X.shape[1])
+    X = utils.to_array_or_spmatrix(X)
+    Y = utils.to_array_or_spmatrix(Y)
+    # inconsistent behaviour from csr and csc
+    if sparse.issparse(X):
+        X = X.tocsr()
+    if sparse.issparse(Y):
+        Y = Y.tocsr()
+    if measure == 'difference':
+        difference = mean_difference(X, Y)
+    elif measure == 'emd':
+        difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)(
+            select.select_cols(X, idx=i),
+            select.select_cols(Y, idx=i))
+                                             for i in range(X.shape[1]))
+        difference = np.array(difference) * np.sign(mean_difference(X, Y))
+    result = pd.DataFrame({'gene' : gene_names, measure : difference})
+    if direction == 'up':
+        result = result.sort_values([measure, 'gene'], ascending=False)
+    elif direction == 'down':
+        result = result.sort_values([measure, 'gene'], ascending=True)
+    elif direction == 'both':
+        result['measure_abs'] = np.abs(difference)
+        result = result.sort_values(['measure_abs', 'gene'], ascending=False)
+        del result['measure_abs']
+    result.index = np.arange(result.shape[0])
+    return result
+
+
+def differential_expression_by_cluster(data, clusters,
+                                       measure='difference',
+                                       direction='up',
+                                       gene_names=None,
+                                       n_jobs=-2):
+    """Calculate the most significant genes for each cluster in a dataset
+
+    Measurements are run for each cluster against the rest of the dataset.
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_cells, n_genes]
+    clusters : list-like, shape=[n_cells]
+    measure : {'difference', 'emd'}, optional (default: 'difference')
+        The measurement to be used to rank genes
+    direction : {'up', 'down', 'both'}, optional (default: 'up')
+        The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value.
+    gene_names : list-like or `None`, optional (default: `None`)
+        List of gene names associated with the columns of X and Y
+    n_jobs : int, optional (default: -2)
+        Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores.
+
+    Returns
+    -------
+    result : dict(pd.DataFrame)
+        Dictionary containing an ordered DataFrame with a column "gene" and a column named `measure` for each cluster.
+    """
+    if gene_names is not None and isinstance(data, pd.DataFrame):
+        data = select.select_cols(data, idx=gene_names)
+        gene_names = data.columns
+    if gene_names is None:
+        if isinstance(data, pd.DataFrame):
+            gene_names = data.columns
+    elif not len(gene_names) == data.shape[1]:
+        raise ValueError("Expected gene_names to have length {}. "
+                         "Got {}".format(data.shape[1], len(gene_names)))
+    data = utils.to_array_or_spmatrix(data)
+    result = {cluster : differential_expression(
+        select.select_rows(data, idx=clusters==cluster),
+        select.select_rows(data, idx=clusters!=cluster),
+        measure = measure, direction = direction,
+        gene_names = gene_names, n_jobs = n_jobs)
+              for cluster in np.unique(clusters)}
+    return result
+
 def _vector_coerce_dense(x):
     x = utils.toarray(x)
     x_1d = x.flatten()
@@ -381,5 +531,5 @@ def _vector_coerce_two_dense(x, y):
             raise ValueError("Expected x and y to be 1D arrays. "
                              "Got shapes x {}, y {}".format(x.shape, y.shape))
         else:
-            raise
+            raise e
     return x, y
diff --git a/scprep/utils.py b/scprep/utils.py
@@ -364,6 +364,48 @@ def matrix_any(condition):
     return np.sum(np.sum(condition)) > 0
 
 
+def check_consistent_columns(data):
+    """Ensure that a set of data matrices have consistent columns
+
+    Parameters
+    ----------
+    data : list of array-likes
+        List of matrices to be checked
+
+    Returns
+    -------
+    data : list of array-likes
+        List of matrices with consistent columns, subsetted if necessary
+
+    Raises
+    ------
+    ValueError
+        Raised if data has inconsistent number of columns and does not
+        have column names for subsetting
+    """
+    matrix_type = type(data[0])
+    matrix_shape = data[0].shape[1]
+    if issubclass(matrix_type, pd.DataFrame):
+        if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and
+                np.all([data[0].columns == d.columns for d in data])):
+            common_genes = data[0].columns.values
+            for d in data[1:]:
+                common_genes = common_genes[np.isin(common_genes,
+                                                    d.columns.values)]
+            for i in range(len(data)):
+                data[i] = data[i][common_genes]
+            warnings.warn("Input data has inconsistent column names. "
+                          "Subsetting to {} common columns.".format(
+                              len(common_genes)), UserWarning)
+    else:
+        for d in data[1:]:
+            if not d.shape[1] == matrix_shape:
+                shapes = ", ".join([str(d.shape[1]) for d in data])
+                raise ValueError("Expected data all with the same number of "
+                                 "columns. Got {}".format(shapes))
+    return data
+
+
 def combine_batches(data, batch_labels, append_to_cell_names=None):
     """Combine data matrices from multiple batches and store a batch label
 
@@ -405,26 +447,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=None):
             raise TypeError("Expected data all of the same class. "
                             "Got {}".format(types))
 
-    # check consistent columns
-    matrix_shape = data[0].shape[1]
-    if issubclass(matrix_type, pd.DataFrame):
-        if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and
-                np.all([data[0].columns == d.columns for d in data])):
-            common_genes = data[0].columns.values
-            for d in data[1:]:
-                common_genes = common_genes[np.isin(common_genes,
-                                                    d.columns.values)]
-            for i in range(len(data)):
-                data[i] = data[i][common_genes]
-            warnings.warn("Input data has inconsistent column names. "
-                          "Subsetting to {} common columns.".format(
-                              len(common_genes)), UserWarning)
-    else:
-        for d in data[1:]:
-            if not d.shape[1] == matrix_shape:
-                shapes = ", ".join([str(d.shape[1]) for d in data])
-                raise ValueError("Expected data all with the same number of "
-                                 "columns. Got {}".format(shapes))
+    data = check_consistent_columns(data)
 
     # check append_to_cell_names
     if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame):

diff --git a/setup.py b/setup.py
@@ -18,7 +18,8 @@
     'h5py',
     'rpy2>=3.0',
     'coverage',
-    'coveralls'
+    'coveralls',
+    'parameterized',
 ]
 
 doc_requires = [

diff --git a/test/test_stats.py b/test/test_stats.py
@@ -6,6 +6,7 @@
 import scprep
 from functools import partial
 import warnings
+from parameterized import parameterized
 
 
 def _test_fun_2d(X, fun, **kwargs):
@@ -121,3 +122,113 @@ def test_knnDREMI():
         "Attempting to calculate kNN-DREMI on a constant array. "
         "Returning `0`", scprep.stats.knnDREMI, X[:, 0],
         np.zeros_like(X[:, 1]))
+
+
+def test_mean_difference():
+    X = data.load_10X()
+    X = scprep.filter.filter_empty_genes(X)
+    Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100])
+    assert np.allclose(np.max(Y), 16.8125)
+    assert np.allclose(np.min(Y), -0.5625)
+    def test_fun(X, **kwargs):
+        return scprep.stats.mean_difference(
+            scprep.select.select_rows(X, idx=np.arange(20)),
+            scprep.select.select_rows(X, idx=np.arange(20, 100)),
+        **kwargs)
+    matrix.test_all_matrix_types(
+        X, utils.assert_transform_equals, Y=Y,
+        transform=test_fun,
+        check=utils.assert_all_close)
+    assert_raise_message(
+        ValueError,
+        "Expected X and Y to have the same number of columns. "
+        "Got shapes {}, {}".format(X.shape, X.iloc[:,:10].shape),
+        scprep.stats.mean_difference,
+        X, X.iloc[:,:10])
+
+
+@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'),
+               ('emd', 'up'), ('emd', 'down'), ('emd', 'both')])
+def test_differential_expression(measure, direction):
+    X = data.load_10X()
+    X = scprep.filter.filter_empty_genes(X)
+    result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100],
+                                                 measure=measure, direction=direction)
+    expected_results = {('difference', 'up') : ('Gstm5', 16.8125),
+                        ('difference', 'down') : ('Slc2a3', -0.5625),
+                        ('difference', 'both') : ('Gstm5', 16.8125),
+                        ('emd', 'up') : ('Gstm5', 17.5625),
+                        ('emd', 'down') : ('Slc2a3', -0.6875),
+                        ('emd', 'both') : ('Gstm5', 17.5625)}
+    assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0]
+    assert np.allclose(result[measure][0],
+                       expected_results[(measure, direction)][1])
+    def test_fun(X, **kwargs):
+        return scprep.stats.differential_expression(
+            scprep.select.select_rows(X, idx=np.arange(20)),
+            scprep.select.select_rows(X, idx=np.arange(20, 100)),
+        **kwargs)
+    def check_fun(Y1, Y2):
+        if direction == 'both':
+            Y1[measure] = np.abs(Y1[measure])
+            Y2[measure] = np.abs(Y2[measure])
+        np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4)
+        Y1 = Y1.sort_values('gene')
+        Y2 = Y2.sort_values('gene')
+        np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4)
+    matrix.test_all_matrix_types(
+        X, utils.assert_transform_equals, Y=result,
+        transform=test_fun,
+        check=check_fun,
+        gene_names=X.columns,
+        measure=measure, direction=direction)
+
+
+def test_differential_expression_error():
+    X = data.load_10X()
+    assert_raise_message(
+        ValueError, "Expected `direction` in ['up', 'down', 'both']. "
+        "Got invalid", scprep.stats.differential_expression,
+        X, X, direction='invalid')
+    assert_raise_message(
+        ValueError, "Expected `measure` in ['difference', 'emd']. "
+        "Got invalid", scprep.stats.differential_expression,
+        X, X, measure='invalid')
+    assert_raise_message(
+        ValueError, "Expected `X` and `Y` to be matrices. "
+        "Got shapes {}, {}".format(X.shape, X.iloc[0].shape),
+        scprep.stats.differential_expression,
+        X, X.iloc[0])
+    assert_raise_message(
+        ValueError, "Expected gene_names to have length {}. "
+        "Got {}".format(X.shape[0], X.shape[0]//2),
+        scprep.stats.differential_expression,
+        X.to_coo(), X.to_coo(), gene_names=np.arange(X.shape[0]//2))
+    assert_raise_message(
+        ValueError, "Expected gene_names to have length {}. "
+        "Got {}".format(X.shape[0], X.shape[0]//2),
+        scprep.stats.differential_expression_by_cluster,
+        X.to_coo(), np.random.choice(2, X.shape[0], replace=True),
+        gene_names=np.arange(X.shape[0]//2))
+    assert_warns_message(
+        UserWarning, "Input data has inconsistent column names. "
+        "Subsetting to 20 common columns.",
+        scprep.stats.differential_expression,
+        X, X.iloc[:,:20])
+
+
+@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'),
+               ('emd', 'up'), ('emd', 'down'), ('emd', 'both')])
+def test_differential_expression_by_cluster(measure, direction):
+    X = data.load_10X()
+    np.random.seed(42)
+    clusters = np.random.choice(4, X.shape[0], replace=True)
+    result = scprep.stats.differential_expression_by_cluster(
+        X, clusters,
+        measure=measure, direction=direction)
+    for cluster in range(4):
+        r = scprep.stats.differential_expression(
+            scprep.select.select_rows(X, idx=clusters==cluster),
+            scprep.select.select_rows(X, idx=clusters!=cluster),
+        measure=measure, direction=direction)
+        assert np.all(result[cluster] == r)