Merge pull request #90 from KrishnaswamyLab/dev

scprep v1.0.1
KrishnaswamyLab · Oct 14, 2019 · e520220 · e520220
2 parents f0fa702 + 16f8916
commit e520220
Show file tree

Hide file tree

Showing 20 changed files with 995 additions and 149 deletions.
diff --git a/logo.svg b/logo.svg
diff --git a/scprep/_lazyload.py b/scprep/_lazyload.py
@@ -15,7 +15,7 @@
              {'rinterface_lib': ['callbacks']}],
     'h5py': [],
     'tables': [],
-    'statsmodels': [{'nonparametric': ['smoothers_lowess']}],
+    'requests': [],
 }
 
 

diff --git a/scprep/filter.py b/scprep/filter.py
@@ -4,7 +4,9 @@
 import numpy as np
 import pandas as pd
 from scipy import sparse
+
 import warnings
+import numbers
 
 from . import utils, measure, select
 
@@ -85,7 +87,7 @@ def filter_rare_genes(data, *extra_data, cutoff=0, min_cells=5):
     extra_data : array-like, shape=[any, m_features]
         Filtered extra data, if passed.
     """
-    gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1)
+    gene_sums = measure.gene_capture_count(data, cutoff=cutoff)
     keep_genes_idx = gene_sums >= min_cells
     data = select.select_cols(data, *extra_data, idx=keep_genes_idx)
     return data
@@ -139,15 +141,17 @@ def filter_values(data, *extra_data, values=None,
         Optional additional data objects from which to select the same rows
     values : list-like, shape=[n_samples]
         Value upon which to filter
-    cutoff : float, optional (default: None)
-        Minimum library size required to retain a cell. Only one of `cutoff`
+    cutoff : float or tuple of floats, optional (default: None)
+        Value above or below which to retain cells. Only one of `cutoff`
         and `percentile` should be specified.
-    percentile : int, optional (Default: None)
-        Percentile above or below which to remove cells.
+    percentile : int or tuple of ints, optional (Default: None)
+        Percentile above or below which to retain cells.
         Must be an integer between 0 and 100. Only one of `cutoff`
         and `percentile` should be specified.
-    keep_cells : {'above', 'below'}, optional (default: 'above')
-        Keep cells above or below the cutoff
+    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
+        Keep cells above, below or between the cutoff.
+        If None, defaults to 'above' when a single cutoff is given and
+        'between' when two cutoffs are given.
     return_values : bool, optional (default: False)
         If True, also return the values corresponding to the retained cells
     sample_labels : Deprecated
@@ -183,7 +187,7 @@ def filter_values(data, *extra_data, values=None,
 
 
 def filter_library_size(data, *extra_data, cutoff=None, percentile=None,
-                        keep_cells='above',
+                        keep_cells=None,
                         return_library_size=False,
                         sample_labels=None,
                         filter_per_sample=None):
@@ -198,15 +202,17 @@ def filter_library_size(data, *extra_data, cutoff=None, percentile=None,
         Input data
     extra_data : array-like, shape=[n_samples, any], optional
         Optional additional data objects from which to select the same rows
-    cutoff : float, optional (default: None)
-        Minimum library size required to retain a cell. Only one of `cutoff`
+    cutoff : float or tuple of floats, optional (default: None)
+        Library size above or below which to retain a cell. Only one of `cutoff`
         and `percentile` should be specified.
-    percentile : int, optional (Default: None)
-        Percentile above or below which to remove cells.
+    percentile : int or tuple of ints, optional (Default: None)
+        Percentile above or below which to retain a cell.
         Must be an integer between 0 and 100. Only one of `cutoff`
         and `percentile` should be specified.
-    keep_cells : {'above', 'below'}, optional (default: 'above')
-        Keep cells above or below the cutoff
+    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
+        Keep cells above, below or between the cutoff.
+        If None, defaults to 'above' when a single cutoff is given and
+        'between' when two cutoffs are given.
     return_library_size : bool, optional (default: False)
         If True, also return the library sizes corresponding to the retained cells
     sample_labels : Deprecated
@@ -236,7 +242,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
                                exact_word=None, regex=None,
                                cutoff=None, percentile=None,
                                library_size_normalize=False,
-                               keep_cells='below',
+                               keep_cells=None,
                                return_expression=False,
                                sample_labels=None,
                                filter_per_sample=None):
@@ -261,17 +267,18 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
         If not None, select genes that contain this exact word.
     regex : str or None, optional (default: None)
         If not None, select genes that match this regular expression
-    cutoff : float, optional (default: 2000)
-        Value above or below which to remove cells. Only one of `cutoff`
+    cutoff : float or tuple of floats, optional (default: None)
+        Expression value above or below which to remove cells. Only one of `cutoff`
         and `percentile` should be specified.
-    percentile : int, optional (Default: None)
-        Percentile above or below which to remove cells.
+    percentile : int or tuple of ints, optional (Default: None)
+        Percentile above or below which to retain a cell.
         Must be an integer between 0 and 100. Only one of `cutoff`
         and `percentile` should be specified.
     library_size_normalize : bool, optional (default: False)
         Divide gene set expression by library size
-    keep_cells : {'above', 'below'}, optional (default: 'below')
-        Keep cells above or below the cutoff
+    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
+        Keep cells above or below the cutoff. If None, defaults to
+        'below' for one cutoff and 'between' for two.
     return_expression : bool, optional (default: False)
         If True, also return the values corresponding to the retained cells
     sample_labels : Deprecated
@@ -287,6 +294,10 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
     extra_data : array-like, shape=[m_samples, any]
         Filtered extra data, if passed.
     """
+    if keep_cells is None:
+        if isinstance(cutoff, numbers.Number) or \
+            isinstance(percentile, numbers.Number):
+            keep_cells = 'below'
     cell_sums = measure.gene_set_expression(
         data, genes=genes,
         starts_with=starts_with, ends_with=ends_with,

diff --git a/scprep/io/download.py b/scprep/io/download.py
@@ -1,9 +1,11 @@
-import requests
 import zipfile
 import tempfile
 import os
 import urllib.request
 
+from .._lazyload import requests
+from .. import utils
+
 _CHUNK_SIZE = 32768
 _GOOGLE_DRIVE_URL = "https://docs.google.com/uc?export=download"
 _FAKE_HEADERS = [('User-Agent', 'Mozilla/5.0')]
@@ -27,6 +29,7 @@ def _google_drive_confirm_token(response):
     return None
 
 
+@utils._with_pkg(pkg="requests")
 def _GET_google_drive(id):
     """Post a GET request to Google Drive"""
     global _GOOGLE_DRIVE_URL

diff --git a/scprep/io/fcs.py b/scprep/io/fcs.py
@@ -249,10 +249,8 @@ def load_fcs(filename, gene_names=True, cell_names=True,
     channel_naming: '$PnS' | '$PnN'
         Determines which meta data field is used for naming the channels.
         The default should be $PnS (even though it is not guaranteed to be unique)
-        $PnN stands for the short name (guaranteed to be unique).
-            Will look like 'FL1-H'
-        $PnS stands for the actual name (not guaranteed to be unique).
-            Will look like 'FSC-H' (Forward scatter)
+        $PnN stands for the short name (guaranteed to be unique). Will look like 'FL1-H'
+        $PnS stands for the actual name (not guaranteed to be unique). Will look like 'FSC-H' (Forward scatter)
         The chosen field will be used to population self.channels
         Note: These names are not flipped in the implementation.
         It looks like they were swapped for some reason in the official FCS specification.

diff --git a/scprep/measure.py b/scprep/measure.py
@@ -3,9 +3,9 @@
 import warnings
 import numbers
 import scipy.signal
+from scipy import sparse
 
 from . import utils, select
-from ._lazyload import statsmodels
 
 
 def library_size(data):
@@ -69,25 +69,21 @@ def gene_set_expression(data, genes=None, library_size_normalize=False,
     return gene_set_expression
 
 
-@utils._with_pkg(pkg="statsmodels")
-def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_means=False):
+def gene_variability(data, kernel_size=0.005, smooth=5, return_means=False):
     """Measure the variability of each gene in a dataset
 
-    Variability is computed as the deviation from a loess fit
-    to the rolling median of the mean-variance curve
+    Variability is computed as the deviation from
+    the rolling median of the mean-variance curve
 
     Parameters
     ----------
     data : array-like, shape=[n_samples, n_features]
         Input data
-    span : float, optional (default: 0.7)
-        Fraction of genes to use when computing the loess estimate at each point
-    interpolate : float, optional (default: 0.2)
-        Multiple of the standard deviation of variances at which to interpolate
-        linearly in order to reduce computation time.
-    kernel_size : float or int, optional (default: 0.05)
+    kernel_size : float or int, optional (default: 0.005)
         Width of rolling median window. If a float between 0 and 1, the width is given by
         kernel_size * data.shape[1]. Otherwise should be an odd integer
+    smooth : int, optional (default: 5)
+        Amount of smoothing to apply to the median filter
     return_means : boolean, optional (default: False)
         If True, return the gene means
 
@@ -98,21 +94,52 @@ def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_m
     """
     columns = data.columns if isinstance(data, pd.DataFrame) else None
     data = utils.to_array_or_spmatrix(data)
+    if isinstance(data, sparse.dia_matrix):
+        data = data.tocsc()
     data_std = utils.matrix_std(data, axis=0) ** 2
+    data_mean = utils.toarray(np.mean(data, axis=0)).flatten()
+
     if kernel_size < 1:
         kernel_size = 2*(int(kernel_size * len(data_std))//2)+1
-    order = np.argsort(data_std)
+
+    order = np.argsort(data_mean)
     data_std_med = np.empty_like(data_std)
-    data_std_med[order] = scipy.signal.medfilt(data_std[order], kernel_size=kernel_size)
-    data_mean = utils.toarray(np.mean(data, axis=0)).flatten()
-    delta = np.std(data_std_med) * interpolate
-    lowess = statsmodels.nonparametric.smoothers_lowess.lowess(
-        data_std_med, data_mean,
-        delta=delta, frac=span, return_sorted=False)
-    result = data_std - lowess
+    data_std_order = data_std[order]
+    # handle overhang with reflection
+    data_std_order = np.r_[data_std_order[kernel_size::-1], data_std_order, data_std_order[:-kernel_size:-1]]
+    medfilt = scipy.signal.medfilt(data_std_order, kernel_size=kernel_size)[kernel_size:-kernel_size]
+
+    # apply a little smoothing
+    for i in range(smooth):
+        medfilt = np.r_[(medfilt[1:] + medfilt[:-1])/2, medfilt[-1]]
+
+    data_std_med[order] = medfilt
+    result = data_std - data_std_med
+
     if columns is not None:
         result = pd.Series(result, index=columns, name='variability')
         data_mean = pd.Series(data_mean, index=columns, name='mean')
     if return_means:
         result = result, data_mean
     return result
+
+
+def gene_capture_count(data, cutoff=0):
+    """Measure the number of cells in which each gene has non-negligible counts
+
+    Parameters
+    ----------
+    data : array-like, shape=[n_samples, n_features]
+        Input data
+    cutoff : float, optional (default: 0)
+        Number of counts above which expression is deemed non-negligible
+
+    Returns
+    -------
+    capture-count : list-like, shape=[m_features]
+        Capture count for each gene
+    """
+    gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1)
+    if isinstance(data, pd.DataFrame):
+        gene_sums = pd.Series(gene_sums, index=data.columns, name='capture_count')
+    return gene_sums
diff --git a/scprep/normalize.py b/scprep/normalize.py
@@ -10,7 +10,7 @@
 from . import measure, utils
 
 
-def _get_scaled_libsize(data, rescale='median', return_library_size=False):
+def _get_scaled_libsize(data, rescale=10000, return_library_size=False):
     if return_library_size or rescale in ['median', 'mean']:
         libsize = measure.library_size(data)
     else:
@@ -98,6 +98,7 @@ def library_size_normalize(data, rescale=10000,
             data_norm = pd.DataFrame(data_norm)
         data_norm.columns = columns
         data_norm.index = index
+        libsize = pd.Series(libsize, index=index, name='library_size')
     if return_library_size:
         return data_norm, libsize
     else:

diff --git a/scprep/plot/histogram.py b/scprep/plot/histogram.py
@@ -7,6 +7,53 @@
                     temp_fontsize, parse_fontsize)
 from .tools import label_axis
 
+_EPS = np.finfo('float').eps
+
+def _log_bins(xmin, xmax, bins):
+    if xmin > xmax:
+        return np.array([xmax])
+    xmin = np.log10(xmin)
+    xmax = np.log10(xmax)
+    xrange = max(xmax - xmin, 1)
+    xmin = max(xmin - xrange * 0.1, np.log10(_EPS))
+    xmax = xmax + xrange * 0.1
+    return np.logspace(xmin, xmax, bins + 1)
+
+
+def _symlog_bins(xmin, xmax, abs_min, bins):
+    if xmin > 0:
+        bins = _log_bins(xmin, xmax, bins)
+    elif xmax < 0:
+        bins = -1 * _log_bins(-xmax, -xmin, bins)[::-1]
+    else:
+        # symlog
+        bins = max(bins, 3)
+        if xmax > 0 and xmin < 0:
+            bins = max(bins, 3)
+            neg_range = np.log(-xmin) - np.log(abs_min)
+            pos_range = np.log(xmax) - np.log(abs_min)
+            total_range = pos_range + neg_range
+            if total_range > 0:
+                n_pos_bins = np.round((bins-1) * pos_range / (pos_range + neg_range)).astype(int)
+            else:
+                n_pos_bins = 1
+            n_neg_bins = max(bins - n_pos_bins - 1, 1)
+        elif xmax > 0:
+            bins = max(bins, 2)
+            n_pos_bins = bins - 1
+            n_neg_bins = 0
+        elif xmin < 0:
+            bins = max(bins, 2)
+            n_neg_bins = bins - 1
+            n_pos_bins = 0
+        else:
+            # identically zero
+            return np.array([-1, -0.1, 0.1, 1])
+        pos_bins = _log_bins(abs_min, xmax, n_pos_bins)
+        neg_bins = -1 * _log_bins(abs_min, -xmin, n_neg_bins)[::-1]
+        bins = np.concatenate([neg_bins, pos_bins])
+    return bins
+
 
 @utils._with_pkg(pkg="matplotlib", min_version=3)
 def histogram(data,
@@ -87,25 +134,15 @@ def histogram(data,
             if alpha is None:
                 alpha = 1
         if log == 'x' or log is True:
-            if xmax < np.finfo('float').eps:
-                raise ValueError("Expected positive data for log = {}. "
-                                 "Got max(data) = {:.2f}".format(log, xmax))
-            elif xmin < np.finfo('float').eps:
-                warnings.warn("Expected positive data for log = {}. "
-                              "Got min(data) = {:.2f}".format(log, xmin), UserWarning)
-                xmin = np.finfo('float').eps
-            xmin = np.log10(xmin)
-            xmax = np.log10(xmax)
-            xrange = max(xmax - xmin, 1)
-            xmin = xmin - xrange * 0.1
-            xmax = xmax + xrange * 0.1
-            bins = np.logspace(xmin,
-                               xmax,
-                               bins)
+            d_flat = np.concatenate(data) if isinstance(data, list) else data
+            abs_min = np.min(np.where(d_flat != 0, np.abs(d_flat), np.max(np.abs(d_flat))))
+            if abs_min == 0:
+                abs_min = 0.1
+            bins = _symlog_bins(xmin, xmax, abs_min, bins=bins)
         ax.hist(data, bins=bins, histtype=histtype, alpha=alpha, **kwargs)
 
         if log == 'x' or log is True:
-            ax.set_xscale('log')
+            ax.set_xscale('symlog', linthreshx=abs_min)
         if log == 'y' or log is True:
             ax.set_yscale('log')
 

diff --git a/scprep/plot/tools.py b/scprep/plot/tools.py
@@ -124,7 +124,7 @@ def generate_legend(cmap, ax, title=None, marker='o', markersize=10,
                                 markersize=markersize)
                for label, color in cmap.items()]
     if ncol is None:
-        ncol = max(1, len(cmap) // max_rows)
+        ncol = max(1, np.ceil(len(cmap) / max_rows).astype(int))
     legend = ax.legend(handles=handles, title=title,
                        loc=loc, bbox_to_anchor=bbox_to_anchor,
                        fontsize=fontsize, ncol=ncol, **kwargs)