Skip to content

Commit

Permalink
Merge 1dec709 into 635eefe
Browse files Browse the repository at this point in the history
  • Loading branch information
scottgigante committed Sep 28, 2019
2 parents 635eefe + 1dec709 commit 543c960
Show file tree
Hide file tree
Showing 13 changed files with 303 additions and 7 deletions.
3 changes: 2 additions & 1 deletion scprep/_lazyload.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
'rinterface',
{'rinterface_lib': ['callbacks']}],
'h5py': [],
'tables': []
'tables': [],
'statsmodels': [{'nonparametric': ['smoothers_lowess']}],
}


Expand Down
38 changes: 38 additions & 0 deletions scprep/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,3 +369,41 @@ def filter_duplicates(data, *extra_data, sample_labels=None):
unique_idx = _find_unique_cells(data)
data = select.select_rows(data, *extra_data, idx=unique_idx)
return data


def filter_variable_genes(data, *extra_data, span=0.7, interpolate=0.2,
cutoff=None, percentile=80):
"""Filter all genes with low variability
Variability is computed as the deviation from a loess fit of the mean-variance curve
Parameters
----------
data : array-like, shape=[n_samples, n_features]
Input data
extra_data : array-like, shape=[any, n_features], optional
Optional additional data objects from which to select the same rows
span : float, optional (default: 0.7)
Fraction of genes to use when computing the loess estimate at each point
interpolate : float, optional (default: 0.2)
Multiple of the standard deviation of variances at which to interpolate
linearly in order to reduce computation time.
cutoff : float, optional (default: None)
Variability above which expression is deemed significant
percentile : int, optional (Default: 80)
Percentile above or below which to remove genes.
Must be an integer between 0 and 100. Only one of `cutoff`
and `percentile` should be specified.
Returns
-------
data : array-like, shape=[n_samples, m_features]
Filtered output data, where m_features <= n_features
extra_data : array-like, shape=[any, m_features]
Filtered extra data, if passed.
"""
var_genes = measure.variable_genes(data, span=span, interpolate=interpolate)
keep_cells_idx = _get_filter_idx(var_genes,
cutoff, percentile,
keep_cells='above')
return select.select_cols(data, *extra_data, idx=keep_cells_idx)
36 changes: 36 additions & 0 deletions scprep/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numbers

from . import utils, select
from ._lazyload import statsmodels


def library_size(data):
Expand Down Expand Up @@ -67,6 +68,41 @@ def gene_set_expression(data, genes=None, library_size_normalize=False,
return gene_set_expression


@utils._with_pkg(pkg="statsmodels")
def variable_genes(data, span=0.7, interpolate=0.2):
"""Measure the variability of each gene in a dataset
Variability is computed as the deviation from a loess fit of the mean-variance curve
Parameters
----------
data : array-like, shape=[n_samples, n_features]
Input data
span : float, optional (default: 0.7)
Fraction of genes to use when computing the loess estimate at each point
interpolate : float, optional (default: 0.2)
Multiple of the standard deviation of variances at which to interpolate
linearly in order to reduce computation time.
Returns
-------
variability : list-like, shape=[n_samples]
Variability for each gene
"""
columns = data.columns if isinstance(data, pd.DataFrame) else None
data = utils.to_array_or_spmatrix(data)
data_std = utils.matrix_std(data, axis=0) ** 2
data_mean = utils.toarray(np.mean(data, axis=0)).flatten()
delta = np.std(data_std) * interpolate
lowess = statsmodels.nonparametric.smoothers_lowess.lowess(
data_std, data_mean,
delta=delta, frac=span, return_sorted=False)
variability = data_std - lowess
if columns is not None:
variability = pd.Series(variability, index=columns, name='variability')
return variability


def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False):
"""Get a cutoff for a dataset
Expand Down
2 changes: 1 addition & 1 deletion scprep/plot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .scatter import scatter, scatter2d, scatter3d, rotate_scatter3d
from .histogram import histogram, plot_library_size, plot_gene_set_expression
from .histogram import histogram, plot_library_size, plot_gene_set_expression, plot_variable_genes
from .marker import marker_plot
from .scree import scree_plot
from .jitter import jitter
Expand Down
80 changes: 80 additions & 0 deletions scprep/plot/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,83 @@ def plot_gene_set_expression(data, genes=None,
bins=bins, log=log, ax=ax, figsize=figsize,
xlabel=xlabel, title=title, fontsize=fontsize,
filename=filename, dpi=dpi, **kwargs)


@utils._with_pkg(pkg="matplotlib", min_version=3)
def plot_variable_genes(data, span=0.7, interpolate=0.2,
bins=100, log=False,
cutoff=None, percentile=None,
ax=None, figsize=None,
xlabel='Gene variability',
ylabel='Number of genes',
title=None,
fontsize=None,
filename=None,
dpi=None, **kwargs):
"""Plot the histogram of gene variability
Variability is computed as the deviation from a loess fit of the mean-variance curve
Parameters
----------
data : array-like, shape=[n_samples, n_features]
Input data. Multiple datasets may be given as a list of array-likes.
span : float, optional (default: 0.7)
Fraction of genes to use when computing the loess estimate at each point
interpolate : float, optional (default: 0.2)
Multiple of the standard deviation of variances at which to interpolate
linearly in order to reduce computation time.
bins : int, optional (default: 100)
Number of bins to draw in the histogram
log : bool, or {'x', 'y'}, optional (default: False)
If True, plot both axes on a log scale. If 'x' or 'y',
only plot the given axis on a log scale. If False,
plot both axes on a linear scale.
cutoff : float or `None`, optional (default: `None`)
Absolute cutoff at which to draw a vertical line.
Only one of `cutoff` and `percentile` may be given.
percentile : float or `None`, optional (default: `None`)
Percentile between 0 and 100 at which to draw a vertical line.
Only one of `cutoff` and `percentile` may be given.
library_size_normalize : bool, optional (default: False)
Divide gene set expression by library size
ax : `matplotlib.Axes` or None, optional (default: None)
Axis to plot on. If None, a new axis will be created.
figsize : tuple or None, optional (default: None)
If not None, sets the figure size (width, height)
[x,y]label : str, optional
Labels to display on the x and y axis.
title : str or None, optional (default: None)
Axis title.
fontsize : float or None (default: None)
Base font size.
filename : str or None (default: None)
file to which the output is saved
dpi : int or None, optional (default: None)
The resolution in dots per inch. If None it will default to the value
savefig.dpi in the matplotlibrc file. If 'figure' it will set the dpi
to be the value of the figure. Only used if filename is not None.
**kwargs : additional arguments for `matplotlib.pyplot.hist`
Returns
-------
ax : `matplotlib.Axes`
axis on which plot was drawn
"""
if hasattr(data, 'shape') and len(data.shape) == 2:
var_genes = measure.variable_genes(
data, span=span, interpolate=interpolate)
else:
data_array = utils.to_array_or_spmatrix(data)
if len(data_array.shape) == 2 and data_array.dtype.type is not np.object_:
var_genes = measure.variable_genes(
data_array, span=span, interpolate=interpolate)
else:
var_genes = [measure.variable_genes(
d, span=span, interpolate=interpolate)
for d in data]
return histogram(var_genes,
cutoff=cutoff, percentile=percentile,
bins=bins, log=log, ax=ax, figsize=figsize,
xlabel=xlabel, title=title, fontsize=fontsize,
filename=filename, dpi=dpi, **kwargs)
4 changes: 2 additions & 2 deletions scprep/plot/scree.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def scree_plot(singular_values, cumulative=False, ax=None, figsize=None,
if cumulative:
explained_variance = np.cumsum(explained_variance)
fig, ax, show_fig = _get_figure(ax, figsize)
ax.plot(np.arange(len(explained_variance)),
explained_variance, **kwargs)
ax.bar(np.arange(len(explained_variance)),
explained_variance, **kwargs)
label_axis(ax.xaxis, label=xlabel)
label_axis(ax.yaxis, label=ylabel)
if show_fig:
Expand Down
2 changes: 1 addition & 1 deletion scprep/plot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ def show(fig):
fig : matplotlib.Figure
Figure to show
"""
fig.tight_layout()
if _mpl_is_gui_backend():
fig.tight_layout()
if platform.system() == "Windows":
plt.show(block=True)
else:
Expand Down
61 changes: 60 additions & 1 deletion scprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def toarray(x):
elif isinstance(x, sparse.spmatrix):
x = x.toarray()
elif isinstance(x, np.matrix):
x = np.array(x)
x = x.A
elif isinstance(x, list):
x_out = []
for xi in x:
Expand Down Expand Up @@ -224,6 +224,65 @@ def matrix_sum(data, axis=None):
return sums


def matrix_std(data, axis=None):
"""Get the column-wise, row-wise, or total standard deviation of a matrix
Parameters
----------
data : array-like, shape=[n_samples, n_features]
Input data
axis : int or None, optional (default: None)
Axis across which to calculate standard deviation.
axis=0 gives column standard deviation,
axis=1 gives row standard deviation.
None gives the total standard deviation.
Returns
-------
std : array-like or float
Standard deviation along desired axis.
"""
if axis not in [0, 1, None]:
raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
index = None
if isinstance(data, pd.DataFrame) and axis is not None:
if axis == 1:
index = data.index
elif axis == 0:
index = data.columns
data = to_array_or_spmatrix(data)
if sparse.issparse(data):
if axis is None:
if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
data = data.tocoo()
data_sq = data.copy()
data_sq.data = data_sq.data ** 2
variance = data_sq.mean() - data.mean() ** 2
std = np.sqrt(variance)
else:
if axis == 0:
data = data.tocsc()
next_fn = data.getcol
N = data.shape[1]
elif axis == 1:
data = data.tocsr()
next_fn = data.getrow
N = data.shape[0]
std = []
for i in range(N):
col = next_fn(i)
col_sq = col.copy()
col_sq.data = col_sq.data ** 2
variance = col_sq.mean() - col.mean() ** 2
std.append(np.sqrt(variance))
std = np.array(std)
else:
std = np.std(data, axis=axis)
if index is not None:
std = pd.Series(std, index=index, name='std')
return std


def matrix_vector_elementwise_multiply(data, multiplier, axis=None):
"""Elementwise multiply a matrix by a vector
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
'coverage',
'coveralls',
'parameterized',
'statsmodels',
]

doc_requires = [
Expand Down
11 changes: 11 additions & 0 deletions test/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,17 @@ def test_filter_rare_genes(self):
self.X_dense, utils.assert_transform_equals,
Y=X_filtered, transform=scprep.filter.filter_rare_genes)

def test_filter_variable_genes(self):
X_filtered = scprep.filter.filter_variable_genes(self.X_dense, percentile=70)
assert X_filtered.shape[0] == self.X_dense.shape[0]
assert X_filtered.shape[1] <= 30
assert X_filtered.shape[1] >= 20
assert self.X_dense.columns[np.argmax(self.X_dense.values.std(axis=0))] in X_filtered.columns
matrix.test_all_matrix_types(
self.X_dense, utils.assert_transform_equals,
Y=X_filtered, transform=scprep.filter.filter_variable_genes, percentile=70)


def test_library_size_filter(self):
X_filtered = scprep.filter.filter_library_size(
self.X_sparse, cutoff=100)
Expand Down
10 changes: 9 additions & 1 deletion test/test_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,18 @@ def test_fun(X):
matrix.test_pandas_matrix_types(
self.X_dense, test_fun)

def test_library_size(self):
def test_gene_set_expression(self):
def test_fun(X):
x = scprep.measure.gene_set_expression(X, genes=[0, 1])
assert x.name == 'expression'
assert np.all(x.index == self.X_dense.index)
matrix.test_pandas_matrix_types(
self.X_dense, test_fun)

def test_variable_genes(self):
def test_fun(X):
x = scprep.measure.variable_genes(X)
assert x.name == 'variability'
assert np.all(x.index == self.X_dense.columns)
matrix.test_pandas_matrix_types(
self.X_dense, test_fun)
18 changes: 18 additions & 0 deletions test/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,7 @@ def tearDownClass(self):
try_remove("test_jitter.png")
try_remove("test_histogram.png")
try_remove("test_library_size.png")
try_remove("test_variable_genes.png")
try_remove("test_gene_expression.png")

def tearDown(self):
Expand Down Expand Up @@ -799,6 +800,23 @@ def test_plot_gene_set_expression_single_gene(self):
filename="test_gene_expression.png")
assert os.path.exists("test_gene_expression.png")

def test_plot_variable_genes(self):
scprep.plot.plot_variable_genes(
self.X,
color='r')

def test_plot_variable_genes_multiple(self):
scprep.plot.plot_variable_genes([
self.X, scprep.select.select_rows(
self.X, idx=np.arange(self.X.shape[0] // 2))],
filename="test_variable_genes.png",
color=['r', 'b'])
assert os.path.exists("test_variable_genes.png")

def test_variable_genes_list_of_lists(self):
scprep.plot.plot_variable_genes(
scprep.utils.toarray(self.X).tolist())

def test_histogram_single_gene_dataframe(self):
scprep.plot.histogram(
scprep.select.select_cols(self.X, idx=['Arl8b']),
Expand Down

0 comments on commit 543c960

Please sign in to comment.