Merge 1dec709 into 635eefe

KrishnaswamyLab · Sep 28, 2019 · 543c960 · 543c960
2 parents 635eefe + 1dec709
commit 543c960
Show file tree

Hide file tree

Showing 13 changed files with 303 additions and 7 deletions.
diff --git a/scprep/_lazyload.py b/scprep/_lazyload.py
@@ -14,7 +14,8 @@
              'rinterface',
              {'rinterface_lib': ['callbacks']}],
     'h5py': [],
-    'tables': []
+    'tables': [],
+    'statsmodels': [{'nonparametric': ['smoothers_lowess']}],
 }
 
 

diff --git a/scprep/filter.py b/scprep/filter.py
@@ -369,3 +369,41 @@ def filter_duplicates(data, *extra_data, sample_labels=None):
     unique_idx = _find_unique_cells(data)
     data = select.select_rows(data, *extra_data, idx=unique_idx)
     return data
+
+
+def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2,
+                          cutoff=None, percentile=80):
+    """Filter all genes with low variability
+
+    Variability is computed as the deviation from a loess fit of the mean-variance curve
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data
+    extra_data : array-like, shape=[any, n_features], optional
+        Optional additional data objects from which to select the same rows
+    span : float, optional (default: 0.7)
+        Fraction of genes to use when computing the loess estimate at each point
+    interpolate : float, optional (default: 0.2)
+        Multiple of the standard deviation of variances at which to interpolate
+        linearly in order to reduce computation time.
+    cutoff : float, optional (default: None)
+        Variability above which expression is deemed significant
+    percentile : int, optional (Default: 80)
+        Percentile above or below which to remove genes.
+        Must be an integer between 0 and 100. Only one of `cutoff`
+        and `percentile` should be specified.
+
+    Returns
+    -------
+    data : array-like, shape=[n_samples, m_features]
+        Filtered output data, where m_features <= n_features
+    extra_data : array-like, shape=[any, m_features]
+        Filtered extra data, if passed.
+    """
+    var_genes = measure.variable_genes(data, span=span, interpolate=interpolate)
+    keep_cells_idx = _get_filter_idx(var_genes,
+                                     cutoff, percentile,
+                                     keep_cells='above')
+    return select.select_cols(data, *extra_data, idx=keep_cells_idx)
diff --git a/scprep/measure.py b/scprep/measure.py
@@ -4,6 +4,7 @@
 import numbers
 
 from . import utils, select
+from ._lazyload import statsmodels
 
 
 def library_size(data):
@@ -67,6 +68,41 @@ def gene_set_expression(data, genes=None, library_size_normalize=False,
     return gene_set_expression
 
 
+@utils._with_pkg(pkg="statsmodels")
+def variable_genes(data, span=0.7, interpolate=0.2):
+    """Measure the variability of each gene in a dataset
+
+    Variability is computed as the deviation from a loess fit of the mean-variance curve
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data
+    span : float, optional (default: 0.7)
+        Fraction of genes to use when computing the loess estimate at each point
+    interpolate : float, optional (default: 0.2)
+        Multiple of the standard deviation of variances at which to interpolate
+        linearly in order to reduce computation time.
+
+    Returns
+    -------
+    variability : list-like, shape=[n_samples]
+        Variability for each gene
+    """
+    columns = data.columns if isinstance(data, pd.DataFrame) else None
+    data = utils.to_array_or_spmatrix(data)
+    data_std = utils.matrix_std(data, axis=0) ** 2
+    data_mean = utils.toarray(np.mean(data, axis=0)).flatten()
+    delta = np.std(data_std) * interpolate
+    lowess = statsmodels.nonparametric.smoothers_lowess.lowess(
+        data_std, data_mean,
+        delta=delta, frac=span, return_sorted=False)
+    variability = data_std - lowess
+    if columns is not None:
+        variability = pd.Series(variability, index=columns, name='variability')
+    return variability
+
+
 def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False):
     """Get a cutoff for a dataset
 

diff --git a/scprep/plot/__init__.py b/scprep/plot/__init__.py
@@ -1,5 +1,5 @@
 from .scatter import scatter, scatter2d, scatter3d, rotate_scatter3d
-from .histogram import histogram, plot_library_size, plot_gene_set_expression
+from .histogram import histogram, plot_library_size, plot_gene_set_expression, plot_variable_genes
 from .marker import marker_plot
 from .scree import scree_plot
 from .jitter import jitter

diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py
@@ -288,3 +288,83 @@ def plot_gene_set_expression(data, genes=None,
                      bins=bins, log=log, ax=ax, figsize=figsize,
                      xlabel=xlabel, title=title, fontsize=fontsize,
                      filename=filename, dpi=dpi, **kwargs)
+
+
+@utils._with_pkg(pkg="matplotlib", min_version=3)
+def plot_variable_genes(data, span=0.7, interpolate=0.2,
+                        bins=100, log=False,
+                        cutoff=None, percentile=None,
+                        ax=None, figsize=None,
+                        xlabel='Gene variability',
+                        ylabel='Number of genes',
+                        title=None,
+                        fontsize=None,
+                        filename=None,
+                        dpi=None, **kwargs):
+    """Plot the histogram of gene variability
+
+    Variability is computed as the deviation from a loess fit of the mean-variance curve
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data. Multiple datasets may be given as a list of array-likes.
+    span : float, optional (default: 0.7)
+        Fraction of genes to use when computing the loess estimate at each point
+    interpolate : float, optional (default: 0.2)
+        Multiple of the standard deviation of variances at which to interpolate
+        linearly in order to reduce computation time.
+    bins : int, optional (default: 100)
+        Number of bins to draw in the histogram
+    log : bool, or {'x', 'y'}, optional (default: False)
+        If True, plot both axes on a log scale. If 'x' or 'y',
+        only plot the given axis on a log scale. If False,
+        plot both axes on a linear scale.
+    cutoff : float or `None`, optional (default: `None`)
+        Absolute cutoff at which to draw a vertical line.
+        Only one of `cutoff` and `percentile` may be given.
+    percentile : float or `None`, optional (default: `None`)
+        Percentile between 0 and 100 at which to draw a vertical line.
+        Only one of `cutoff` and `percentile` may be given.
+    library_size_normalize : bool, optional (default: False)
+        Divide gene set expression by library size
+    ax : `matplotlib.Axes` or None, optional (default: None)
+        Axis to plot on. If None, a new axis will be created.
+    figsize : tuple or None, optional (default: None)
+        If not None, sets the figure size (width, height)
+    [x,y]label : str, optional
+        Labels to display on the x and y axis.
+    title : str or None, optional (default: None)
+        Axis title.
+    fontsize : float or None (default: None)
+        Base font size.
+    filename : str or None (default: None)
+        file to which the output is saved
+    dpi : int or None, optional (default: None)
+        The resolution in dots per inch. If None it will default to the value
+        savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi
+        to be the value of the figure. Only used if filename is not None.
+    **kwargs : additional arguments for `matplotlib.pyplot.hist`
+
+    Returns
+    -------
+    ax : `matplotlib.Axes`
+        axis on which plot was drawn
+    """
+    if hasattr(data, 'shape') and len(data.shape) == 2:
+        var_genes = measure.variable_genes(
+            data, span=span, interpolate=interpolate)
+    else:
+        data_array = utils.to_array_or_spmatrix(data)
+        if len(data_array.shape) == 2 and data_array.dtype.type is not np.object_:
+            var_genes = measure.variable_genes(
+                data_array, span=span, interpolate=interpolate)
+        else:
+            var_genes = [measure.variable_genes(
+                d, span=span, interpolate=interpolate)
+                for d in data]
+    return histogram(var_genes,
+                     cutoff=cutoff, percentile=percentile,
+                     bins=bins, log=log, ax=ax, figsize=figsize,
+                     xlabel=xlabel, title=title, fontsize=fontsize,
+                     filename=filename, dpi=dpi, **kwargs)
diff --git a/scprep/plot/scree.py b/scprep/plot/scree.py
@@ -50,8 +50,8 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None,
         if cumulative:
             explained_variance = np.cumsum(explained_variance)
         fig, ax, show_fig = _get_figure(ax, figsize)
-        ax.plot(np.arange(len(explained_variance)),
-                explained_variance, **kwargs)
+        ax.bar(np.arange(len(explained_variance)),
+               explained_variance, **kwargs)
         label_axis(ax.xaxis, label=xlabel)
         label_axis(ax.yaxis, label=ylabel)
         if show_fig:

diff --git a/scprep/plot/utils.py b/scprep/plot/utils.py
@@ -82,8 +82,8 @@ def show(fig):
     fig : matplotlib.Figure
         Figure to show
     """
+    fig.tight_layout()
     if _mpl_is_gui_backend():
-        fig.tight_layout()
         if platform.system() == "Windows":
             plt.show(block=True)
         else:

diff --git a/scprep/utils.py b/scprep/utils.py
@@ -99,7 +99,7 @@ def toarray(x):
     elif isinstance(x, sparse.spmatrix):
         x = x.toarray()
     elif isinstance(x, np.matrix):
-        x = np.array(x)
+        x = x.A
     elif isinstance(x, list):
         x_out = []
         for xi in x:
@@ -224,6 +224,65 @@ def matrix_sum(data, axis=None):
     return sums
 
 
+def matrix_std(data, axis=None):
+    """Get the column-wise, row-wise, or total standard deviation of a matrix
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data
+    axis : int or None, optional (default: None)
+        Axis across which to calculate standard deviation.
+        axis=0 gives column standard deviation,
+        axis=1 gives row standard deviation.
+        None gives the total standard deviation.
+
+    Returns
+    -------
+    std : array-like or float
+        Standard deviation along desired axis.
+    """
+    if axis not in [0, 1, None]:
+        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
+    index = None
+    if isinstance(data, pd.DataFrame) and axis is not None:
+        if axis == 1:
+            index = data.index
+        elif axis == 0:
+            index = data.columns
+    data = to_array_or_spmatrix(data)
+    if sparse.issparse(data):
+        if axis is None:
+            if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
+                data = data.tocoo()
+            data_sq = data.copy()
+            data_sq.data = data_sq.data ** 2
+            variance = data_sq.mean() - data.mean() ** 2
+            std = np.sqrt(variance)
+        else:
+            if axis == 0:
+                data = data.tocsc()
+                next_fn = data.getcol
+                N = data.shape[1]
+            elif axis == 1:
+                data = data.tocsr()
+                next_fn = data.getrow
+                N = data.shape[0]
+            std = []
+            for i in range(N):
+                col = next_fn(i)
+                col_sq = col.copy()
+                col_sq.data = col_sq.data ** 2
+                variance = col_sq.mean() - col.mean() ** 2
+                std.append(np.sqrt(variance))
+            std = np.array(std)
+    else:
+        std = np.std(data, axis=axis)
+    if index is not None:
+        std = pd.Series(std, index=index, name='std')
+    return std
+
+
 def matrix_vector_elementwise_multiply(data, multiplier, axis=None):
     """Elementwise multiply a matrix by a vector
 

diff --git a/setup.py b/setup.py
@@ -19,6 +19,7 @@
     'coverage',
     'coveralls',
     'parameterized',
+    'statsmodels',
 ]
 
 doc_requires = [

diff --git a/test/test_filter.py b/test/test_filter.py
@@ -85,6 +85,17 @@ def test_filter_rare_genes(self):
             self.X_dense, utils.assert_transform_equals,
             Y=X_filtered, transform=scprep.filter.filter_rare_genes)
 
+    def test_filter_variable_genes(self):
+        X_filtered = scprep.filter.filter_variable_genes(self.X_dense, percentile=70)
+        assert X_filtered.shape[0] == self.X_dense.shape[0]
+        assert X_filtered.shape[1] <= 30
+        assert X_filtered.shape[1] >= 20
+        assert self.X_dense.columns[np.argmax(self.X_dense.values.std(axis=0))] in X_filtered.columns
+        matrix.test_all_matrix_types(
+            self.X_dense, utils.assert_transform_equals,
+            Y=X_filtered, transform=scprep.filter.filter_variable_genes, percentile=70)
+
+
     def test_library_size_filter(self):
         X_filtered = scprep.filter.filter_library_size(
             self.X_sparse, cutoff=100)

diff --git a/test/test_measure.py b/test/test_measure.py
@@ -60,10 +60,18 @@ def test_fun(X):
         matrix.test_pandas_matrix_types(
             self.X_dense, test_fun)
 
-    def test_library_size(self):
+    def test_gene_set_expression(self):
         def test_fun(X):
             x = scprep.measure.gene_set_expression(X, genes=[0, 1])
             assert x.name == 'expression'
             assert np.all(x.index == self.X_dense.index)
         matrix.test_pandas_matrix_types(
             self.X_dense, test_fun)
+
+    def test_variable_genes(self):
+        def test_fun(X):
+            x = scprep.measure.variable_genes(X)
+            assert x.name == 'variability'
+            assert np.all(x.index == self.X_dense.columns)
+        matrix.test_pandas_matrix_types(
+            self.X_dense, test_fun)
diff --git a/test/test_plot.py b/test/test_plot.py
@@ -742,6 +742,7 @@ def tearDownClass(self):
         try_remove("test_jitter.png")
         try_remove("test_histogram.png")
         try_remove("test_library_size.png")
+        try_remove("test_variable_genes.png")
         try_remove("test_gene_expression.png")
 
     def tearDown(self):
@@ -799,6 +800,23 @@ def test_plot_gene_set_expression_single_gene(self):
             filename="test_gene_expression.png")
         assert os.path.exists("test_gene_expression.png")
 
+    def test_plot_variable_genes(self):
+        scprep.plot.plot_variable_genes(
+            self.X,
+            color='r')
+
+    def test_plot_variable_genes_multiple(self):
+        scprep.plot.plot_variable_genes([
+            self.X, scprep.select.select_rows(
+                self.X, idx=np.arange(self.X.shape[0] // 2))],
+            filename="test_variable_genes.png",
+            color=['r', 'b'])
+        assert os.path.exists("test_variable_genes.png")
+
+    def test_variable_genes_list_of_lists(self):
+        scprep.plot.plot_variable_genes(
+            scprep.utils.toarray(self.X).tolist())
+
     def test_histogram_single_gene_dataframe(self):
         scprep.plot.histogram(
             scprep.select.select_cols(self.X, idx=['Arl8b']),