From 1fcef5adad2c62be6e61b0e63f430d377c0cd23a Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 17:46:17 -0400 Subject: [PATCH 1/3] add differential expression functionality --- scprep/stats.py | 154 ++++++++++++++++++++++++++++++++++++++++++++- scprep/utils.py | 63 +++++++++++++------ test/test_stats.py | 72 +++++++++++++++++++++ 3 files changed, 267 insertions(+), 22 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index 263fb01f..f60ab922 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -3,9 +3,11 @@ import numbers import numpy as np +import pandas as pd from scipy import stats, sparse from sklearn import neighbors, metrics -from . import plot, utils +import joblib +from . import plot, utils, select import warnings from ._lazyload import matplotlib @@ -363,6 +365,154 @@ def plot_knnDREMI(dremi, mutual_info, x, y, n_bins, n_mesh, plot.utils.show(fig) +def mean_difference(X, Y): + """Calculate the mean difference in genes between two datasets + + In the case where the data has been log normalized, + this is equivalent to fold change. + + Parameters + ---------- + X : array-like, shape=[n_cells, n_genes] + Y : array-like, shape=[m_cells, n_genes] + + Returns + ------- + difference : list-like, shape=[n_genes] + """ + if not X.shape[1] == Y.shape[1]: + raise ValueError("Expected X and Y to have the same number of columns. " + "Got shapes {}, {}".format(X.shape, Y.shape)) + X = utils.to_array_or_spmatrix(X) + Y = utils.to_array_or_spmatrix(Y) + X = utils.toarray(X.mean(axis=0)).flatten() + Y = utils.toarray(Y.mean(axis=0)).flatten() + return X - Y + + +def differential_expression(X, Y, + test='difference', + direction='up', + gene_names=None, + n_jobs=-2): + """Calculate the most significant genes between two datasets + + Parameters + ---------- + X : array-like, shape=[n_cells, n_genes] + Y : array-like, shape=[m_cells, n_genes] + test : {'difference', 'emd'}, optional (default: 'difference') + The statistical test to be used to rank genes + direction : {'up', 'down', 'both'}, optional (default: 'up') + The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. + gene_names : list-like or `None`, optional (default: `None`) + List of gene names associated with the columns of X and Y + n_jobs : int, optional (default: -2) + Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + + Returns + ------- + result : pd.DataFrame + Ordered DataFrame with a column "gene" and a column named `test`. + """ + if not direction in ['up', 'down', 'both']: + raise ValueError("Expected `direction` in ['up', 'down', 'both']. " + "Got {}".format(test)) + if not test in ['difference', 'emd']: + raise ValueError("Expected `test` in ['difference', 'emd']. " + "Got {}".format(test)) + if not (len(X.shape) == 2 and len(Y.shape) == 2): + raise ValueError("Expected `X` and `Y` to be matrices. " + "Got shapes {}, {}".format(X.shape, Y.shape)) + [X, Y] = utils.check_consistent_columns([X, Y]) + if gene_names is not None: + if isinstance(X, pd.DataFrame): + X = select.select_cols(X, idx=gene_names) + gene_names = X.columns + if isinstance(Y, pd.DataFrame): + Y = select.select_cols(Y, idx=gene_names) + gene_names = Y.columns + if not len(gene_names) == X.shape[1]: + raise ValueError("Expected gene_names to have length {}. " + "Got {}".format(X.shape[1], len(gene_names))) + else: + if isinstance(X, pd.DataFrame) and isinstance(Y, pd.DataFrame): + gene_names = X.columns + else: + gene_names = np.arange(X.shape[1]) + X = utils.to_array_or_spmatrix(X) + Y = utils.to_array_or_spmatrix(Y) + # inconsistent behaviour from csr and csc + if sparse.issparse(X): + X = X.tocsr() + if sparse.issparse(Y): + Y = Y.tocsr() + if test == 'difference': + difference = mean_difference(X, Y) + elif test == 'emd': + difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)( + select.select_cols(X, idx=i), + select.select_cols(Y, idx=i)) + for i in range(X.shape[1])) + difference = np.array(difference) * np.sign(mean_difference(X, Y)) + result = pd.DataFrame({'gene' : gene_names, test : difference}) + if direction == 'up': + result = result.sort_values([test, 'gene'], ascending=False) + elif direction == 'down': + result = result.sort_values([test, 'gene'], ascending=True) + elif direction == 'both': + result['test_abs'] = np.abs(difference) + result = result.sort_values(['test_abs', 'gene'], ascending=False) + del result['test_abs'] + result.index = np.arange(result.shape[0]) + return result + + +def differential_expression_by_cluster(data, clusters, + test='difference', + direction='up', + gene_names=None, + n_jobs=-2): + """Calculate the most significant genes for each cluster in a dataset + + Tests are run for each cluster against the rest of the dataset. + + Parameters + ---------- + data : array-like, shape=[n_cells, n_genes] + clusters : list-like, shape=[n_cells] + test : {'difference', 'emd'}, optional (default: 'difference') + The statistical test to be used to rank genes + direction : {'up', 'down', 'both'}, optional (default: 'up') + The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. + gene_names : list-like or `None`, optional (default: `None`) + List of gene names associated with the columns of X and Y + n_jobs : int, optional (default: -2) + Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + + Returns + ------- + result : dict(pd.DataFrame) + Dictionary containing an ordered DataFrame with a column "gene" and a column named `test` for each cluster. + """ + if gene_names is not None and isinstance(data, pd.DataFrame): + data = select.select_cols(data, idx=gene_names) + gene_names = data.columns + if gene_names is None: + if isinstance(data, pd.DataFrame): + gene_names = data.columns + elif not len(gene_names) == data.shape[1]: + raise ValueError("Expected gene_names to have length {}. " + "Got {}".format(data.shape[1], len(gene_names))) + data = utils.to_array_or_spmatrix(data) + result = {cluster : differential_expression( + select.select_rows(data, idx=clusters==cluster), + select.select_rows(data, idx=clusters!=cluster), + test = test, direction = direction, + gene_names = gene_names, n_jobs = n_jobs) + for cluster in np.unique(clusters)} + return result + def _vector_coerce_dense(x): x = utils.toarray(x) x_1d = x.flatten() @@ -381,5 +531,5 @@ def _vector_coerce_two_dense(x, y): raise ValueError("Expected x and y to be 1D arrays. " "Got shapes x {}, y {}".format(x.shape, y.shape)) else: - raise + raise e return x, y diff --git a/scprep/utils.py b/scprep/utils.py index bf02c252..b7c247cc 100644 --- a/scprep/utils.py +++ b/scprep/utils.py @@ -364,6 +364,48 @@ def matrix_any(condition): return np.sum(np.sum(condition)) > 0 +def check_consistent_columns(data): + """Ensure that a set of data matrices have consistent columns + + Parameters + ---------- + data : list of array-likes + List of matrices to be checked + + Returns + ------- + data : list of array-likes + List of matrices with consistent columns, subsetted if necessary + + Raises + ------ + ValueError + Raised if data has inconsistent number of columns and does not + have column names for subsetting + """ + matrix_type = type(data[0]) + matrix_shape = data[0].shape[1] + if issubclass(matrix_type, pd.DataFrame): + if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and + np.all([data[0].columns == d.columns for d in data])): + common_genes = data[0].columns.values + for d in data[1:]: + common_genes = common_genes[np.isin(common_genes, + d.columns.values)] + for i in range(len(data)): + data[i] = data[i][common_genes] + warnings.warn("Input data has inconsistent column names. " + "Subsetting to {} common columns.".format( + len(common_genes)), UserWarning) + else: + for d in data[1:]: + if not d.shape[1] == matrix_shape: + shapes = ", ".join([str(d.shape[1]) for d in data]) + raise ValueError("Expected data all with the same number of " + "columns. Got {}".format(shapes)) + return data + + def combine_batches(data, batch_labels, append_to_cell_names=None): """Combine data matrices from multiple batches and store a batch label @@ -405,26 +447,7 @@ def combine_batches(data, batch_labels, append_to_cell_names=None): raise TypeError("Expected data all of the same class. " "Got {}".format(types)) - # check consistent columns - matrix_shape = data[0].shape[1] - if issubclass(matrix_type, pd.DataFrame): - if not (np.all([d.shape[1] == matrix_shape for d in data[1:]]) and - np.all([data[0].columns == d.columns for d in data])): - common_genes = data[0].columns.values - for d in data[1:]: - common_genes = common_genes[np.isin(common_genes, - d.columns.values)] - for i in range(len(data)): - data[i] = data[i][common_genes] - warnings.warn("Input data has inconsistent column names. " - "Subsetting to {} common columns.".format( - len(common_genes)), UserWarning) - else: - for d in data[1:]: - if not d.shape[1] == matrix_shape: - shapes = ", ".join([str(d.shape[1]) for d in data]) - raise ValueError("Expected data all with the same number of " - "columns. Got {}".format(shapes)) + data = check_consistent_columns(data) # check append_to_cell_names if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame): diff --git a/test/test_stats.py b/test/test_stats.py index fdb7eff1..88927702 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -6,6 +6,7 @@ import scprep from functools import partial import warnings +from parameterized import parameterized def _test_fun_2d(X, fun, **kwargs): @@ -121,3 +122,74 @@ def test_knnDREMI(): "Attempting to calculate kNN-DREMI on a constant array. " "Returning `0`", scprep.stats.knnDREMI, X[:, 0], np.zeros_like(X[:, 1])) + + +def test_mean_difference(): + X = data.load_10X() + X = scprep.filter.filter_empty_genes(X) + Y = scprep.stats.mean_difference(X.iloc[:20], X.iloc[20:100]) + assert np.allclose(np.max(Y), 16.8125) + assert np.allclose(np.min(Y), -0.5625) + def test_fun(X, **kwargs): + return scprep.stats.mean_difference( + scprep.select.select_rows(X, idx=np.arange(20)), + scprep.select.select_rows(X, idx=np.arange(20, 100)), + **kwargs) + matrix.test_all_matrix_types( + X, utils.assert_transform_equals, Y=Y, + transform=test_fun, + check=utils.assert_all_close) + + +@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), + ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) +def test_differential_expression(test, direction): + X = data.load_10X() + X = scprep.filter.filter_empty_genes(X) + result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100], + test=test, direction=direction) + expected_results = {('difference', 'up') : ('Gstm5', 16.8125), + ('difference', 'down') : ('Slc2a3', -0.5625), + ('difference', 'both') : ('Gstm5', 16.8125), + ('emd', 'up') : ('Gstm5', 17.5625), + ('emd', 'down') : ('Slc2a3', -0.6875), + ('emd', 'both') : ('Gstm5', 17.5625)} + assert result['gene'][0] == expected_results[(test, direction)][0], result['gene'][0] + assert np.allclose(result[test][0], + expected_results[(test, direction)][1]) + def test_fun(X, **kwargs): + return scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=np.arange(20)), + scprep.select.select_rows(X, idx=np.arange(20, 100)), + **kwargs) + def check_fun(Y1, Y2): + if direction == 'both': + Y1[test] = np.abs(Y1[test]) + Y2[test] = np.abs(Y2[test]) + np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + Y1 = Y1.sort_values('gene') + Y2 = Y2.sort_values('gene') + np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + matrix.test_all_matrix_types( + X, utils.assert_transform_equals, Y=result, + transform=test_fun, + check=check_fun, + gene_names=X.columns, + test=test, direction=direction) + + +@parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), + ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) +def test_differential_expression_by_cluster(test, direction): + X = data.load_10X() + np.random.seed(42) + clusters = np.random.choice(4, X.shape[0], replace=True) + result = scprep.stats.differential_expression_by_cluster( + X, clusters, + test=test, direction=direction) + for cluster in range(4): + r = scprep.stats.differential_expression( + scprep.select.select_rows(X, idx=clusters==cluster), + scprep.select.select_rows(X, idx=clusters!=cluster), + test=test, direction=direction) + assert np.all(result[cluster] == r) From 90528466efaffdb2f642350d5ea764864e8969a3 Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 18:07:57 -0400 Subject: [PATCH 2/3] rename to and add tests --- scprep/stats.py | 48 +++++++++++++++++----------------- test/test_stats.py | 65 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/scprep/stats.py b/scprep/stats.py index f60ab922..574c2e8a 100644 --- a/scprep/stats.py +++ b/scprep/stats.py @@ -391,7 +391,7 @@ def mean_difference(X, Y): def differential_expression(X, Y, - test='difference', + measure='difference', direction='up', gene_names=None, n_jobs=-2): @@ -401,26 +401,26 @@ def differential_expression(X, Y, ---------- X : array-like, shape=[n_cells, n_genes] Y : array-like, shape=[m_cells, n_genes] - test : {'difference', 'emd'}, optional (default: 'difference') - The statistical test to be used to rank genes + measure : {'difference', 'emd'}, optional (default: 'difference') + The measurement to be used to rank genes direction : {'up', 'down', 'both'}, optional (default: 'up') The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. gene_names : list-like or `None`, optional (default: `None`) List of gene names associated with the columns of X and Y n_jobs : int, optional (default: -2) - Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. Returns ------- result : pd.DataFrame - Ordered DataFrame with a column "gene" and a column named `test`. + Ordered DataFrame with a column "gene" and a column named `measure`. """ if not direction in ['up', 'down', 'both']: raise ValueError("Expected `direction` in ['up', 'down', 'both']. " - "Got {}".format(test)) - if not test in ['difference', 'emd']: - raise ValueError("Expected `test` in ['difference', 'emd']. " - "Got {}".format(test)) + "Got {}".format(direction)) + if not measure in ['difference', 'emd']: + raise ValueError("Expected `measure` in ['difference', 'emd']. " + "Got {}".format(measure)) if not (len(X.shape) == 2 and len(Y.shape) == 2): raise ValueError("Expected `X` and `Y` to be matrices. " "Got shapes {}, {}".format(X.shape, Y.shape)) @@ -447,53 +447,53 @@ def differential_expression(X, Y, X = X.tocsr() if sparse.issparse(Y): Y = Y.tocsr() - if test == 'difference': + if measure == 'difference': difference = mean_difference(X, Y) - elif test == 'emd': + elif measure == 'emd': difference = joblib.Parallel(n_jobs)(joblib.delayed(EMD)( select.select_cols(X, idx=i), select.select_cols(Y, idx=i)) for i in range(X.shape[1])) difference = np.array(difference) * np.sign(mean_difference(X, Y)) - result = pd.DataFrame({'gene' : gene_names, test : difference}) + result = pd.DataFrame({'gene' : gene_names, measure : difference}) if direction == 'up': - result = result.sort_values([test, 'gene'], ascending=False) + result = result.sort_values([measure, 'gene'], ascending=False) elif direction == 'down': - result = result.sort_values([test, 'gene'], ascending=True) + result = result.sort_values([measure, 'gene'], ascending=True) elif direction == 'both': - result['test_abs'] = np.abs(difference) - result = result.sort_values(['test_abs', 'gene'], ascending=False) - del result['test_abs'] + result['measure_abs'] = np.abs(difference) + result = result.sort_values(['measure_abs', 'gene'], ascending=False) + del result['measure_abs'] result.index = np.arange(result.shape[0]) return result def differential_expression_by_cluster(data, clusters, - test='difference', + measure='difference', direction='up', gene_names=None, n_jobs=-2): """Calculate the most significant genes for each cluster in a dataset - Tests are run for each cluster against the rest of the dataset. + Measurements are run for each cluster against the rest of the dataset. Parameters ---------- data : array-like, shape=[n_cells, n_genes] clusters : list-like, shape=[n_cells] - test : {'difference', 'emd'}, optional (default: 'difference') - The statistical test to be used to rank genes + measure : {'difference', 'emd'}, optional (default: 'difference') + The measurement to be used to rank genes direction : {'up', 'down', 'both'}, optional (default: 'up') The direction in which to consider genes significant. If 'up', rank genes where X > Y. If 'down', rank genes where X < Y. If 'both', rank genes by absolute value. gene_names : list-like or `None`, optional (default: `None`) List of gene names associated with the columns of X and Y n_jobs : int, optional (default: -2) - Number of threads to use if the test is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. + Number of threads to use if the measurement is parallelizable (currently used for EMD). If negative, -1 refers to all available cores. Returns ------- result : dict(pd.DataFrame) - Dictionary containing an ordered DataFrame with a column "gene" and a column named `test` for each cluster. + Dictionary containing an ordered DataFrame with a column "gene" and a column named `measure` for each cluster. """ if gene_names is not None and isinstance(data, pd.DataFrame): data = select.select_cols(data, idx=gene_names) @@ -508,7 +508,7 @@ def differential_expression_by_cluster(data, clusters, result = {cluster : differential_expression( select.select_rows(data, idx=clusters==cluster), select.select_rows(data, idx=clusters!=cluster), - test = test, direction = direction, + measure = measure, direction = direction, gene_names = gene_names, n_jobs = n_jobs) for cluster in np.unique(clusters)} return result diff --git a/test/test_stats.py b/test/test_stats.py index 88927702..cafc914b 100644 --- a/test/test_stats.py +++ b/test/test_stats.py @@ -139,24 +139,30 @@ def test_fun(X, **kwargs): X, utils.assert_transform_equals, Y=Y, transform=test_fun, check=utils.assert_all_close) + assert_raise_message( + ValueError, + "Expected X and Y to have the same number of columns. " + "Got shapes {}, {}".format(X.shape, X.iloc[:,:10].shape), + scprep.stats.mean_difference, + X, X.iloc[:,:10]) @parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) -def test_differential_expression(test, direction): +def test_differential_expression(measure, direction): X = data.load_10X() X = scprep.filter.filter_empty_genes(X) result = scprep.stats.differential_expression(X.iloc[:20], X.iloc[20:100], - test=test, direction=direction) + measure=measure, direction=direction) expected_results = {('difference', 'up') : ('Gstm5', 16.8125), ('difference', 'down') : ('Slc2a3', -0.5625), ('difference', 'both') : ('Gstm5', 16.8125), ('emd', 'up') : ('Gstm5', 17.5625), ('emd', 'down') : ('Slc2a3', -0.6875), ('emd', 'both') : ('Gstm5', 17.5625)} - assert result['gene'][0] == expected_results[(test, direction)][0], result['gene'][0] - assert np.allclose(result[test][0], - expected_results[(test, direction)][1]) + assert result['gene'][0] == expected_results[(measure, direction)][0], result['gene'][0] + assert np.allclose(result[measure][0], + expected_results[(measure, direction)][1]) def test_fun(X, **kwargs): return scprep.stats.differential_expression( scprep.select.select_rows(X, idx=np.arange(20)), @@ -164,32 +170,65 @@ def test_fun(X, **kwargs): **kwargs) def check_fun(Y1, Y2): if direction == 'both': - Y1[test] = np.abs(Y1[test]) - Y2[test] = np.abs(Y2[test]) - np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + Y1[measure] = np.abs(Y1[measure]) + Y2[measure] = np.abs(Y2[measure]) + np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) Y1 = Y1.sort_values('gene') Y2 = Y2.sort_values('gene') - np.testing.assert_allclose(Y1[test], Y2[test], atol=5e-4) + np.testing.assert_allclose(Y1[measure], Y2[measure], atol=5e-4) matrix.test_all_matrix_types( X, utils.assert_transform_equals, Y=result, transform=test_fun, check=check_fun, gene_names=X.columns, - test=test, direction=direction) + measure=measure, direction=direction) + + +def test_differential_expression_error(): + X = data.load_10X() + assert_raise_message( + ValueError, "Expected `direction` in ['up', 'down', 'both']. " + "Got invalid", scprep.stats.differential_expression, + X, X, direction='invalid') + assert_raise_message( + ValueError, "Expected `measure` in ['difference', 'emd']. " + "Got invalid", scprep.stats.differential_expression, + X, X, measure='invalid') + assert_raise_message( + ValueError, "Expected `X` and `Y` to be matrices. " + "Got shapes {}, {}".format(X.shape, X.iloc[0].shape), + scprep.stats.differential_expression, + X, X.iloc[0]) + assert_raise_message( + ValueError, "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0]//2), + scprep.stats.differential_expression, + X.to_coo(), X.to_coo(), gene_names=np.arange(X.shape[0]//2)) + assert_raise_message( + ValueError, "Expected gene_names to have length {}. " + "Got {}".format(X.shape[0], X.shape[0]//2), + scprep.stats.differential_expression_by_cluster, + X.to_coo(), np.random.choice(2, X.shape[0], replace=True), + gene_names=np.arange(X.shape[0]//2)) + assert_warns_message( + UserWarning, "Input data has inconsistent column names. " + "Subsetting to 20 common columns.", + scprep.stats.differential_expression, + X, X.iloc[:,:20]) @parameterized([('difference', 'up'), ('difference', 'down'), ('difference', 'both'), ('emd', 'up'), ('emd', 'down'), ('emd', 'both')]) -def test_differential_expression_by_cluster(test, direction): +def test_differential_expression_by_cluster(measure, direction): X = data.load_10X() np.random.seed(42) clusters = np.random.choice(4, X.shape[0], replace=True) result = scprep.stats.differential_expression_by_cluster( X, clusters, - test=test, direction=direction) + measure=measure, direction=direction) for cluster in range(4): r = scprep.stats.differential_expression( scprep.select.select_rows(X, idx=clusters==cluster), scprep.select.select_rows(X, idx=clusters!=cluster), - test=test, direction=direction) + measure=measure, direction=direction) assert np.all(result[cluster] == r) From 0be7b6cede414ee7d2f902e8384c62e839a783be Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Sun, 15 Sep 2019 19:02:36 -0400 Subject: [PATCH 3/3] require parameterized --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3871b98b..a4d820d6 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ 'h5py', 'rpy2>=3.0', 'coverage', - 'coveralls' + 'coveralls', + 'parameterized', ] doc_requires = [