Skip to content

Commit

Permalink
Merge pull request #90 from KrishnaswamyLab/dev
Browse files Browse the repository at this point in the history
scprep v1.0.1
  • Loading branch information
scottgigante committed Oct 14, 2019
2 parents f0fa702 + 16f8916 commit e520220
Show file tree
Hide file tree
Showing 20 changed files with 995 additions and 149 deletions.
495 changes: 495 additions & 0 deletions logo.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion scprep/_lazyload.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
{'rinterface_lib': ['callbacks']}],
'h5py': [],
'tables': [],
'statsmodels': [{'nonparametric': ['smoothers_lowess']}],
'requests': [],
}


Expand Down
53 changes: 32 additions & 21 deletions scprep/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import numpy as np
import pandas as pd
from scipy import sparse

import warnings
import numbers

from . import utils, measure, select

Expand Down Expand Up @@ -85,7 +87,7 @@ def filter_rare_genes(data, *extra_data, cutoff=0, min_cells=5):
extra_data : array-like, shape=[any, m_features]
Filtered extra data, if passed.
"""
gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1)
gene_sums = measure.gene_capture_count(data, cutoff=cutoff)
keep_genes_idx = gene_sums >= min_cells
data = select.select_cols(data, *extra_data, idx=keep_genes_idx)
return data
Expand Down Expand Up @@ -139,15 +141,17 @@ def filter_values(data, *extra_data, values=None,
Optional additional data objects from which to select the same rows
values : list-like, shape=[n_samples]
Value upon which to filter
cutoff : float, optional (default: None)
Minimum library size required to retain a cell. Only one of `cutoff`
cutoff : float or tuple of floats, optional (default: None)
Value above or below which to retain cells. Only one of `cutoff`
and `percentile` should be specified.
percentile : int, optional (Default: None)
Percentile above or below which to remove cells.
percentile : int or tuple of ints, optional (Default: None)
Percentile above or below which to retain cells.
Must be an integer between 0 and 100. Only one of `cutoff`
and `percentile` should be specified.
keep_cells : {'above', 'below'}, optional (default: 'above')
Keep cells above or below the cutoff
keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
Keep cells above, below or between the cutoff.
If None, defaults to 'above' when a single cutoff is given and
'between' when two cutoffs are given.
return_values : bool, optional (default: False)
If True, also return the values corresponding to the retained cells
sample_labels : Deprecated
Expand Down Expand Up @@ -183,7 +187,7 @@ def filter_values(data, *extra_data, values=None,


def filter_library_size(data, *extra_data, cutoff=None, percentile=None,
keep_cells='above',
keep_cells=None,
return_library_size=False,
sample_labels=None,
filter_per_sample=None):
Expand All @@ -198,15 +202,17 @@ def filter_library_size(data, *extra_data, cutoff=None, percentile=None,
Input data
extra_data : array-like, shape=[n_samples, any], optional
Optional additional data objects from which to select the same rows
cutoff : float, optional (default: None)
Minimum library size required to retain a cell. Only one of `cutoff`
cutoff : float or tuple of floats, optional (default: None)
Library size above or below which to retain a cell. Only one of `cutoff`
and `percentile` should be specified.
percentile : int, optional (Default: None)
Percentile above or below which to remove cells.
percentile : int or tuple of ints, optional (Default: None)
Percentile above or below which to retain a cell.
Must be an integer between 0 and 100. Only one of `cutoff`
and `percentile` should be specified.
keep_cells : {'above', 'below'}, optional (default: 'above')
Keep cells above or below the cutoff
keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
Keep cells above, below or between the cutoff.
If None, defaults to 'above' when a single cutoff is given and
'between' when two cutoffs are given.
return_library_size : bool, optional (default: False)
If True, also return the library sizes corresponding to the retained cells
sample_labels : Deprecated
Expand Down Expand Up @@ -236,7 +242,7 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
exact_word=None, regex=None,
cutoff=None, percentile=None,
library_size_normalize=False,
keep_cells='below',
keep_cells=None,
return_expression=False,
sample_labels=None,
filter_per_sample=None):
Expand All @@ -261,17 +267,18 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
If not None, select genes that contain this exact word.
regex : str or None, optional (default: None)
If not None, select genes that match this regular expression
cutoff : float, optional (default: 2000)
Value above or below which to remove cells. Only one of `cutoff`
cutoff : float or tuple of floats, optional (default: None)
Expression value above or below which to remove cells. Only one of `cutoff`
and `percentile` should be specified.
percentile : int, optional (Default: None)
Percentile above or below which to remove cells.
percentile : int or tuple of ints, optional (Default: None)
Percentile above or below which to retain a cell.
Must be an integer between 0 and 100. Only one of `cutoff`
and `percentile` should be specified.
library_size_normalize : bool, optional (default: False)
Divide gene set expression by library size
keep_cells : {'above', 'below'}, optional (default: 'below')
Keep cells above or below the cutoff
keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
Keep cells above or below the cutoff. If None, defaults to
'below' for one cutoff and 'between' for two.
return_expression : bool, optional (default: False)
If True, also return the values corresponding to the retained cells
sample_labels : Deprecated
Expand All @@ -287,6 +294,10 @@ def filter_gene_set_expression(data, *extra_data, genes=None,
extra_data : array-like, shape=[m_samples, any]
Filtered extra data, if passed.
"""
if keep_cells is None:
if isinstance(cutoff, numbers.Number) or \
isinstance(percentile, numbers.Number):
keep_cells = 'below'
cell_sums = measure.gene_set_expression(
data, genes=genes,
starts_with=starts_with, ends_with=ends_with,
Expand Down
5 changes: 4 additions & 1 deletion scprep/io/download.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import requests
import zipfile
import tempfile
import os
import urllib.request

from .._lazyload import requests
from .. import utils

_CHUNK_SIZE = 32768
_GOOGLE_DRIVE_URL = "https://docs.google.com/uc?export=download"
_FAKE_HEADERS = [('User-Agent', 'Mozilla/5.0')]
Expand All @@ -27,6 +29,7 @@ def _google_drive_confirm_token(response):
return None


@utils._with_pkg(pkg="requests")
def _GET_google_drive(id):
"""Post a GET request to Google Drive"""
global _GOOGLE_DRIVE_URL
Expand Down
6 changes: 2 additions & 4 deletions scprep/io/fcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,8 @@ def load_fcs(filename, gene_names=True, cell_names=True,
channel_naming: '$PnS' | '$PnN'
Determines which meta data field is used for naming the channels.
The default should be $PnS (even though it is not guaranteed to be unique)
$PnN stands for the short name (guaranteed to be unique).
Will look like 'FL1-H'
$PnS stands for the actual name (not guaranteed to be unique).
Will look like 'FSC-H' (Forward scatter)
$PnN stands for the short name (guaranteed to be unique). Will look like 'FL1-H'
$PnS stands for the actual name (not guaranteed to be unique). Will look like 'FSC-H' (Forward scatter)
The chosen field will be used to population self.channels
Note: These names are not flipped in the implementation.
It looks like they were swapped for some reason in the official FCS specification.
Expand Down
65 changes: 46 additions & 19 deletions scprep/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
import warnings
import numbers
import scipy.signal
from scipy import sparse

from . import utils, select
from ._lazyload import statsmodels


def library_size(data):
Expand Down Expand Up @@ -69,25 +69,21 @@ def gene_set_expression(data, genes=None, library_size_normalize=False,
return gene_set_expression


@utils._with_pkg(pkg="statsmodels")
def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_means=False):
def gene_variability(data, kernel_size=0.005, smooth=5, return_means=False):
"""Measure the variability of each gene in a dataset
Variability is computed as the deviation from a loess fit
to the rolling median of the mean-variance curve
Variability is computed as the deviation from
the rolling median of the mean-variance curve
Parameters
----------
data : array-like, shape=[n_samples, n_features]
Input data
span : float, optional (default: 0.7)
Fraction of genes to use when computing the loess estimate at each point
interpolate : float, optional (default: 0.2)
Multiple of the standard deviation of variances at which to interpolate
linearly in order to reduce computation time.
kernel_size : float or int, optional (default: 0.05)
kernel_size : float or int, optional (default: 0.005)
Width of rolling median window. If a float between 0 and 1, the width is given by
kernel_size * data.shape[1]. Otherwise should be an odd integer
smooth : int, optional (default: 5)
Amount of smoothing to apply to the median filter
return_means : boolean, optional (default: False)
If True, return the gene means
Expand All @@ -98,21 +94,52 @@ def gene_variability(data, span=0.7, interpolate=0.2, kernel_size=0.05, return_m
"""
columns = data.columns if isinstance(data, pd.DataFrame) else None
data = utils.to_array_or_spmatrix(data)
if isinstance(data, sparse.dia_matrix):
data = data.tocsc()
data_std = utils.matrix_std(data, axis=0) ** 2
data_mean = utils.toarray(np.mean(data, axis=0)).flatten()

if kernel_size < 1:
kernel_size = 2*(int(kernel_size * len(data_std))//2)+1
order = np.argsort(data_std)

order = np.argsort(data_mean)
data_std_med = np.empty_like(data_std)
data_std_med[order] = scipy.signal.medfilt(data_std[order], kernel_size=kernel_size)
data_mean = utils.toarray(np.mean(data, axis=0)).flatten()
delta = np.std(data_std_med) * interpolate
lowess = statsmodels.nonparametric.smoothers_lowess.lowess(
data_std_med, data_mean,
delta=delta, frac=span, return_sorted=False)
result = data_std - lowess
data_std_order = data_std[order]
# handle overhang with reflection
data_std_order = np.r_[data_std_order[kernel_size::-1], data_std_order, data_std_order[:-kernel_size:-1]]
medfilt = scipy.signal.medfilt(data_std_order, kernel_size=kernel_size)[kernel_size:-kernel_size]

# apply a little smoothing
for i in range(smooth):
medfilt = np.r_[(medfilt[1:] + medfilt[:-1])/2, medfilt[-1]]

data_std_med[order] = medfilt
result = data_std - data_std_med

if columns is not None:
result = pd.Series(result, index=columns, name='variability')
data_mean = pd.Series(data_mean, index=columns, name='mean')
if return_means:
result = result, data_mean
return result


def gene_capture_count(data, cutoff=0):
"""Measure the number of cells in which each gene has non-negligible counts
Parameters
----------
data : array-like, shape=[n_samples, n_features]
Input data
cutoff : float, optional (default: 0)
Number of counts above which expression is deemed non-negligible
Returns
-------
capture-count : list-like, shape=[m_features]
Capture count for each gene
"""
gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1)
if isinstance(data, pd.DataFrame):
gene_sums = pd.Series(gene_sums, index=data.columns, name='capture_count')
return gene_sums
3 changes: 2 additions & 1 deletion scprep/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from . import measure, utils


def _get_scaled_libsize(data, rescale='median', return_library_size=False):
def _get_scaled_libsize(data, rescale=10000, return_library_size=False):
if return_library_size or rescale in ['median', 'mean']:
libsize = measure.library_size(data)
else:
Expand Down Expand Up @@ -98,6 +98,7 @@ def library_size_normalize(data, rescale=10000,
data_norm = pd.DataFrame(data_norm)
data_norm.columns = columns
data_norm.index = index
libsize = pd.Series(libsize, index=index, name='library_size')
if return_library_size:
return data_norm, libsize
else:
Expand Down
69 changes: 53 additions & 16 deletions scprep/plot/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,53 @@
temp_fontsize, parse_fontsize)
from .tools import label_axis

_EPS = np.finfo('float').eps

def _log_bins(xmin, xmax, bins):
if xmin > xmax:
return np.array([xmax])
xmin = np.log10(xmin)
xmax = np.log10(xmax)
xrange = max(xmax - xmin, 1)
xmin = max(xmin - xrange * 0.1, np.log10(_EPS))
xmax = xmax + xrange * 0.1
return np.logspace(xmin, xmax, bins + 1)


def _symlog_bins(xmin, xmax, abs_min, bins):
if xmin > 0:
bins = _log_bins(xmin, xmax, bins)
elif xmax < 0:
bins = -1 * _log_bins(-xmax, -xmin, bins)[::-1]
else:
# symlog
bins = max(bins, 3)
if xmax > 0 and xmin < 0:
bins = max(bins, 3)
neg_range = np.log(-xmin) - np.log(abs_min)
pos_range = np.log(xmax) - np.log(abs_min)
total_range = pos_range + neg_range
if total_range > 0:
n_pos_bins = np.round((bins-1) * pos_range / (pos_range + neg_range)).astype(int)
else:
n_pos_bins = 1
n_neg_bins = max(bins - n_pos_bins - 1, 1)
elif xmax > 0:
bins = max(bins, 2)
n_pos_bins = bins - 1
n_neg_bins = 0
elif xmin < 0:
bins = max(bins, 2)
n_neg_bins = bins - 1
n_pos_bins = 0
else:
# identically zero
return np.array([-1, -0.1, 0.1, 1])
pos_bins = _log_bins(abs_min, xmax, n_pos_bins)
neg_bins = -1 * _log_bins(abs_min, -xmin, n_neg_bins)[::-1]
bins = np.concatenate([neg_bins, pos_bins])
return bins


@utils._with_pkg(pkg="matplotlib", min_version=3)
def histogram(data,
Expand Down Expand Up @@ -87,25 +134,15 @@ def histogram(data,
if alpha is None:
alpha = 1
if log == 'x' or log is True:
if xmax < np.finfo('float').eps:
raise ValueError("Expected positive data for log = {}. "
"Got max(data) = {:.2f}".format(log, xmax))
elif xmin < np.finfo('float').eps:
warnings.warn("Expected positive data for log = {}. "
"Got min(data) = {:.2f}".format(log, xmin), UserWarning)
xmin = np.finfo('float').eps
xmin = np.log10(xmin)
xmax = np.log10(xmax)
xrange = max(xmax - xmin, 1)
xmin = xmin - xrange * 0.1
xmax = xmax + xrange * 0.1
bins = np.logspace(xmin,
xmax,
bins)
d_flat = np.concatenate(data) if isinstance(data, list) else data
abs_min = np.min(np.where(d_flat != 0, np.abs(d_flat), np.max(np.abs(d_flat))))
if abs_min == 0:
abs_min = 0.1
bins = _symlog_bins(xmin, xmax, abs_min, bins=bins)
ax.hist(data, bins=bins, histtype=histtype, alpha=alpha, **kwargs)

if log == 'x' or log is True:
ax.set_xscale('log')
ax.set_xscale('symlog', linthreshx=abs_min)
if log == 'y' or log is True:
ax.set_yscale('log')

Expand Down
2 changes: 1 addition & 1 deletion scprep/plot/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def generate_legend(cmap, ax, title=None, marker='o', markersize=10,
markersize=markersize)
for label, color in cmap.items()]
if ncol is None:
ncol = max(1, len(cmap) // max_rows)
ncol = max(1, np.ceil(len(cmap) / max_rows).astype(int))
legend = ax.legend(handles=handles, title=title,
loc=loc, bbox_to_anchor=bbox_to_anchor,
fontsize=fontsize, ncol=ncol, **kwargs)
Expand Down
Loading

0 comments on commit e520220

Please sign in to comment.