Skip to content

Commit

Permalink
Merge 0be7b6c into a6aad79
Browse files Browse the repository at this point in the history
  • Loading branch information
scottgigante committed Sep 15, 2019
2 parents a6aad79 + 0be7b6c commit 07b8c8d
Show file tree
Hide file tree
Showing 4 changed files with 308 additions and 23 deletions.
154 changes: 152 additions & 2 deletions scprep/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@

import numbers
import numpy as np
import pandas as pd
from scipy import stats, sparse
from sklearn import neighbors, metrics
from . import plot, utils
import joblib
from . import plot, utils, select
import warnings

from ._lazyload import matplotlib
Expand Down Expand Up @@ -363,6 +365,154 @@ def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh,
plot.utils.show(fig)


def mean_difference(X, Y):
"""Calculate the mean difference in genes between two datasets
In the case where the data has been log normalized,
this is equivalent to fold change.
Parameters
----------
X : array-like, shape=[n_cells, n_genes]
Y : array-like, shape=[m_cells, n_genes]
Returns
-------
difference : list-like, shape=[n_genes]
"""
if not X.shape[1] == Y.shape[1]:
raise ValueError("Expected X and Y to have the same number of columns. "
"Got shapes {}, {}".format(X.shape, Y.shape))
X = utils.to_array_or_spmatrix(X)
Y = utils.to_array_or_spmatrix(Y)
X = utils.toarray(X.mean(axis=0)).flatten()
Y = utils.toarray(Y.mean(axis=0)).flatten()
return X - Y


def differential_expression(X, Y,
measure='difference',
direction='up',
gene_names=None,
n_jobs=-2):
"""Calculate the most significant genes between two datasets
Parameters
----------
X : array-like, shape=[n_cells, n_genes]
Y : array-like, shape=[m_cells, n_genes]
measure : {'difference', 'emd'}, optional (default: 'difference')
The measurement to be used to rank genes
direction : {'up', 'down', 'both'}, optional (default: 'up')
The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value.
gene_names : list-like or `None`, optional (default: `None`)
List of gene names associated with the columns of X and Y
n_jobs : int, optional (default: -2)
Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores.
Returns
-------
result : pd.DataFrame
Ordered DataFrame with a column "gene" and a column named `measure`.
"""
if not direction in ['up', 'down', 'both']:
raise ValueError("Expected `direction` in ['up', 'down', 'both']. "
"Got {}".format(direction))
if not measure in ['difference', 'emd']:
raise ValueError("Expected `measure` in ['difference', 'emd']. "
"Got {}".format(measure))
if not (len(X.shape) == 2 and len(Y.shape) == 2):
raise ValueError("Expected `X` and `Y` to be matrices. "
"Got shapes {}, {}".format(X.shape, Y.shape))
[X, Y] = utils.check_consistent_columns([X, Y])
if gene_names is not None:
if isinstance(X, pd.DataFrame):
X = select.select_cols(X, idx=gene_names)
gene_names = X.columns
if isinstance(Y, pd.DataFrame):
Y = select.select_cols(Y, idx=gene_names)
gene_names = Y.columns
if not len(gene_names) == X.shape[1]:
raise ValueError("Expected gene_names to have length {}. "
"Got {}".format(X.shape[1], len(gene_names)))
else:
if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame):
gene_names = X.columns
else:
gene_names = np.arange(X.shape[1])
X = utils.to_array_or_spmatrix(X)
Y = utils.to_array_or_spmatrix(Y)
# inconsistent behaviour from csr and csc
if sparse.issparse(X):
X = X.tocsr()
if sparse.issparse(Y):
Y = Y.tocsr()
if measure == 'difference':
difference = mean_difference(X, Y)
elif measure == 'emd':
difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)(
select.select_cols(X, idx=i),
select.select_cols(Y, idx=i))
for i in range(X.shape[1]))
difference = np.array(difference) * np.sign(mean_difference(X, Y))
result = pd.DataFrame({'gene' : gene_names, measure : difference})
if direction == 'up':
result = result.sort_values([measure, 'gene'], ascending=False)
elif direction == 'down':
result = result.sort_values([measure, 'gene'], ascending=True)
elif direction == 'both':
result['measure_abs'] = np.abs(difference)
result = result.sort_values(['measure_abs', 'gene'], ascending=False)
del result['measure_abs']
result.index = np.arange(result.shape[0])
return result


def differential_expression_by_cluster(data, clusters,
measure='difference',
direction='up',
gene_names=None,
n_jobs=-2):
"""Calculate the most significant genes for each cluster in a dataset
Measurements are run for each cluster against the rest of the dataset.
Parameters
----------
data : array-like, shape=[n_cells, n_genes]
clusters : list-like, shape=[n_cells]
measure : {'difference', 'emd'}, optional (default: 'difference')
The measurement to be used to rank genes
direction : {'up', 'down', 'both'}, optional (default: 'up')
The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value.
gene_names : list-like or `None`, optional (default: `None`)
List of gene names associated with the columns of X and Y
n_jobs : int, optional (default: -2)
Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores.
Returns
-------
result : dict(pd.DataFrame)
Dictionary containing an ordered DataFrame with a column "gene" and a column named `measure` for each cluster.
"""
if gene_names is not None and isinstance(data, pd.DataFrame):
data = select.select_cols(data, idx=gene_names)
gene_names = data.columns
if gene_names is None:
if isinstance(data, pd.DataFrame):
gene_names = data.columns
elif not len(gene_names) == data.shape[1]:
raise ValueError("Expected gene_names to have length {}. "
"Got {}".format(data.shape[1], len(gene_names)))
data = utils.to_array_or_spmatrix(data)
result = {cluster : differential_expression(
select.select_rows(data, idx=clusters==cluster),
select.select_rows(data, idx=clusters!=cluster),
measure = measure, direction = direction,
gene_names = gene_names, n_jobs = n_jobs)
for cluster in np.unique(clusters)}
return result

def _vector_coerce_dense(x):
x = utils.toarray(x)
x_1d = x.flatten()
Expand All @@ -381,5 +531,5 @@ def _vector_coerce_two_dense(x, y):
raise ValueError("Expected x and y to be 1D arrays. "
"Got shapes x {}, y {}".format(x.shape, y.shape))
else:
raise
raise e
return x, y
63 changes: 43 additions & 20 deletions scprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,48 @@ def matrix_any(condition):
return np.sum(np.sum(condition)) > 0


def check_consistent_columns(data):
"""Ensure that a set of data matrices have consistent columns
Parameters
----------
data : list of array-likes
List of matrices to be checked
Returns
-------
data : list of array-likes
List of matrices with consistent columns, subsetted if necessary
Raises
------
ValueError
Raised if data has inconsistent number of columns and does not
have column names for subsetting
"""
matrix_type = type(data[0])
matrix_shape = data[0].shape[1]
if issubclass(matrix_type, pd.DataFrame):
if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and
np.all([data[0].columns == d.columns for d in data])):
common_genes = data[0].columns.values
for d in data[1:]:
common_genes = common_genes[np.isin(common_genes,
d.columns.values)]
for i in range(len(data)):
data[i] = data[i][common_genes]
warnings.warn("Input data has inconsistent column names. "
"Subsetting to {} common columns.".format(
len(common_genes)), UserWarning)
else:
for d in data[1:]:
if not d.shape[1] == matrix_shape:
shapes = ", ".join([str(d.shape[1]) for d in data])
raise ValueError("Expected data all with the same number of "
"columns. Got {}".format(shapes))
return data


def combine_batches(data, batch_labels, append_to_cell_names=None):
"""Combine data matrices from multiple batches and store a batch label
Expand Down Expand Up @@ -405,26 +447,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=None):
raise TypeError("Expected data all of the same class. "
"Got {}".format(types))

# check consistent columns
matrix_shape = data[0].shape[1]
if issubclass(matrix_type, pd.DataFrame):
if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and
np.all([data[0].columns == d.columns for d in data])):
common_genes = data[0].columns.values
for d in data[1:]:
common_genes = common_genes[np.isin(common_genes,
d.columns.values)]
for i in range(len(data)):
data[i] = data[i][common_genes]
warnings.warn("Input data has inconsistent column names. "
"Subsetting to {} common columns.".format(
len(common_genes)), UserWarning)
else:
for d in data[1:]:
if not d.shape[1] == matrix_shape:
shapes = ", ".join([str(d.shape[1]) for d in data])
raise ValueError("Expected data all with the same number of "
"columns. Got {}".format(shapes))
data = check_consistent_columns(data)

# check append_to_cell_names
if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame):
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
'h5py',
'rpy2>=3.0',
'coverage',
'coveralls'
'coveralls',
'parameterized',
]

doc_requires = [
Expand Down
111 changes: 111 additions & 0 deletions test/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import scprep
from functools import partial
import warnings
from parameterized import parameterized


def _test_fun_2d(X, fun, **kwargs):
Expand Down Expand Up @@ -121,3 +122,113 @@ def test_knnDREMI():
"Attempting to calculate kNN-DREMI on a constant array. "
"Returning `0`", scprep.stats.knnDREMI, X[:, 0],
np.zeros_like(X[:, 1]))


def test_mean_difference():
X = data.load_10X()
X = scprep.filter.filter_empty_genes(X)
Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100])
assert np.allclose(np.max(Y), 16.8125)
assert np.allclose(np.min(Y), -0.5625)
def test_fun(X, **kwargs):
return scprep.stats.mean_difference(
scprep.select.select_rows(X, idx=np.arange(20)),
scprep.select.select_rows(X, idx=np.arange(20, 100)),
**kwargs)
matrix.test_all_matrix_types(
X, utils.assert_transform_equals, Y=Y,
transform=test_fun,
check=utils.assert_all_close)
assert_raise_message(
ValueError,
"Expected X and Y to have the same number of columns. "
"Got shapes {}, {}".format(X.shape, X.iloc[:,:10].shape),
scprep.stats.mean_difference,
X, X.iloc[:,:10])


@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'),
('emd', 'up'), ('emd', 'down'), ('emd', 'both')])
def test_differential_expression(measure, direction):
X = data.load_10X()
X = scprep.filter.filter_empty_genes(X)
result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100],
measure=measure, direction=direction)
expected_results = {('difference', 'up') : ('Gstm5', 16.8125),
('difference', 'down') : ('Slc2a3', -0.5625),
('difference', 'both') : ('Gstm5', 16.8125),
('emd', 'up') : ('Gstm5', 17.5625),
('emd', 'down') : ('Slc2a3', -0.6875),
('emd', 'both') : ('Gstm5', 17.5625)}
assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0]
assert np.allclose(result[measure][0],
expected_results[(measure, direction)][1])
def test_fun(X, **kwargs):
return scprep.stats.differential_expression(
scprep.select.select_rows(X, idx=np.arange(20)),
scprep.select.select_rows(X, idx=np.arange(20, 100)),
**kwargs)
def check_fun(Y1, Y2):
if direction == 'both':
Y1[measure] = np.abs(Y1[measure])
Y2[measure] = np.abs(Y2[measure])
np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4)
Y1 = Y1.sort_values('gene')
Y2 = Y2.sort_values('gene')
np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4)
matrix.test_all_matrix_types(
X, utils.assert_transform_equals, Y=result,
transform=test_fun,
check=check_fun,
gene_names=X.columns,
measure=measure, direction=direction)


def test_differential_expression_error():
X = data.load_10X()
assert_raise_message(
ValueError, "Expected `direction` in ['up', 'down', 'both']. "
"Got invalid", scprep.stats.differential_expression,
X, X, direction='invalid')
assert_raise_message(
ValueError, "Expected `measure` in ['difference', 'emd']. "
"Got invalid", scprep.stats.differential_expression,
X, X, measure='invalid')
assert_raise_message(
ValueError, "Expected `X` and `Y` to be matrices. "
"Got shapes {}, {}".format(X.shape, X.iloc[0].shape),
scprep.stats.differential_expression,
X, X.iloc[0])
assert_raise_message(
ValueError, "Expected gene_names to have length {}. "
"Got {}".format(X.shape[0], X.shape[0]//2),
scprep.stats.differential_expression,
X.to_coo(), X.to_coo(), gene_names=np.arange(X.shape[0]//2))
assert_raise_message(
ValueError, "Expected gene_names to have length {}. "
"Got {}".format(X.shape[0], X.shape[0]//2),
scprep.stats.differential_expression_by_cluster,
X.to_coo(), np.random.choice(2, X.shape[0], replace=True),
gene_names=np.arange(X.shape[0]//2))
assert_warns_message(
UserWarning, "Input data has inconsistent column names. "
"Subsetting to 20 common columns.",
scprep.stats.differential_expression,
X, X.iloc[:,:20])


@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'),
('emd', 'up'), ('emd', 'down'), ('emd', 'both')])
def test_differential_expression_by_cluster(measure, direction):
X = data.load_10X()
np.random.seed(42)
clusters = np.random.choice(4, X.shape[0], replace=True)
result = scprep.stats.differential_expression_by_cluster(
X, clusters,
measure=measure, direction=direction)
for cluster in range(4):
r = scprep.stats.differential_expression(
scprep.select.select_rows(X, idx=clusters==cluster),
scprep.select.select_rows(X, idx=clusters!=cluster),
measure=measure, direction=direction)
assert np.all(result[cluster] == r)

0 comments on commit 07b8c8d

Please sign in to comment.