Skip to content

Commit

Permalink
Merge pull request #39 from KrishnaswamyLab/atong-hdf5-version-bump
Browse files Browse the repository at this point in the history
Add support for cellranger >= 3.0.2 10X h5 matrix format
  • Loading branch information
scottgigante committed Apr 14, 2019
2 parents 2981d59 + 6f728f5 commit 4417ae6
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 29 deletions.
Binary file added data/test_data/test_10X_cellranger3.h5
Binary file not shown.
1 change: 1 addition & 0 deletions scprep/io/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def _is_tables(obj, allow_file=True, allow_group=True, allow_dataset=True):
types.append(tables.Group)
if allow_dataset:
types.append(tables.CArray)
types.append(tables.Array)
except NameError:
return False
return isinstance(obj, tuple(types))
Expand Down
66 changes: 45 additions & 21 deletions scprep/io/tenx.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol',
genome : str or None, optional (default: None)
Name of the genome to which CellRanger ran analysis. If None, selects
the first available genome, and prints all available genomes if more
than one is available.
than one is available. Invalid for Cellranger 3.0 HDF5 files.
sparse: boolean
If True, a sparse Pandas DataFrame is returned.
gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
Expand All @@ -256,31 +256,55 @@ def load_10X_HDF5(filename, genome=None, sparse=True, gene_labels='symbol',
"gene_labels='{}' not recognized. "
"Choose from ['symbol', 'id', 'both']".format(gene_labels))

# default allow_duplicates
if allow_duplicates is None:
allow_duplicates = not sparse

with hdf5.open_file(filename, 'r', backend=backend) as f:
if genome is None:
genomes = hdf5.list_nodes(f)
print_genomes = ", ".join(genomes)
genome = genomes[0]
if len(genomes) > 1:
print("Available genomes: {}. Selecting {} by default".format(
print_genomes, genome))

# handle genome
groups = hdf5.list_nodes(f)
try:
group = hdf5.get_node(f, genome)
# Cellranger 3.0
group = hdf5.get_node(f, 'matrix')
if genome is not None:
raise NotImplementedError(
"Selecting genomes for Cellranger 3.0 files is not "
"currently supported. Please file an issue at "
"https://github.com/KrishnaswamyLab/scprep/issues")
except (AttributeError, KeyError):
genomes = hdf5.list_nodes(f)
print_genomes = ", ".join(genomes)
raise ValueError(
"Genome {} not found in {}. "
"Available genomes: {}".format(genome, filename,
print_genomes))
if allow_duplicates is None:
allow_duplicates = not sparse
# Cellranger 2.0
if genome is None:
print_genomes = ", ".join(groups)
genome = groups[0]
if len(groups) > 1:
print("Available genomes: {}. Selecting {} by default".format(
print_genomes, genome))
try:
group = hdf5.get_node(f, genome)
except (AttributeError, KeyError):
print_genomes = ", ".join(groups)
raise ValueError(
"Genome {} not found in {}. "
"Available genomes: {}".format(genome, filename,
print_genomes))

try:
# Cellranger 3.0
features = hdf5.get_node(group, 'features')
gene_symbols = hdf5.get_node(features, 'name')
gene_ids = hdf5.get_node(features, 'id')
except (KeyError, IndexError):
# Cellranger 2.0
gene_symbols = hdf5.get_node(group, 'gene_names')
gene_ids = hdf5.get_node(group, 'genes')

# convert to string column names
gene_names = _parse_10x_genes(
symbols=[g.decode() for g in hdf5.get_values(
hdf5.get_node(group, 'gene_names'))],
ids=[g.decode()
for g in hdf5.get_values(hdf5.get_node(group, 'genes'))],
symbols=[g.decode() for g in hdf5.get_values(gene_symbols)],
ids=[g.decode() for g in hdf5.get_values(gene_ids)],
gene_labels=gene_labels, allow_duplicates=allow_duplicates)

cell_names = [b.decode() for b in hdf5.get_values(
hdf5.get_node(group, 'barcodes'))]
data = hdf5.get_values(hdf5.get_node(group, 'data'))
Expand Down
66 changes: 58 additions & 8 deletions test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ def test_10X():
assert X.shape == (100, 100)
assert isinstance(X, pd.SparseDataFrame)
assert X.columns[0] == "Arl8b (ENSMUSG00000030105)"
X_generanger3 = scprep.io.load_10X(
os.path.join(data.data_dir, "test_10X_generanger3"),
X_cellranger3 = scprep.io.load_10X(
os.path.join(data.data_dir, "test_10X_cellranger3"),
gene_labels="both")
np.testing.assert_array_equal(X.index, X_generanger3.index)
np.testing.assert_array_equal(X.columns, X_generanger3.columns)
np.testing.assert_array_equal(X.index, X_generanger3.index)
np.testing.assert_array_equal(X.index, X_cellranger3.index)
np.testing.assert_array_equal(X.columns, X_cellranger3.columns)
np.testing.assert_array_equal(X.index, X_cellranger3.index)
assert_raise_message(
ValueError,
"gene_labels='invalid' not recognized. "
Expand Down Expand Up @@ -139,27 +139,65 @@ def test_10X_zip_not_a_file():

def test_10X_HDF5():
    """Load the Cellranger 2 HDF5 fixture with each backend and compare it
    against the reference matrix returned by ``data.load_10X()``.
    """
    X = data.load_10X()
    h5_file = os.path.join(data.data_dir, "test_10X.h5")

    def check_matches_reference(X_hdf5):
        # Same container type, same values, same gene and cell labels.
        assert isinstance(X_hdf5, pd.SparseDataFrame)
        assert np.sum(np.sum(X != X_hdf5)) == 0
        np.testing.assert_array_equal(X.columns, X_hdf5.columns)
        np.testing.assert_array_equal(X.index, X_hdf5.index)

    # automatic tables backend
    check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    # explicit tables backend
    check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='tables'))
    # explicit h5py backend
    check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='h5py'))
    # automatic h5py backend: temporarily remove the ``tables`` attribute
    # from the hdf5 module before loading without an explicit backend.
    tables = scprep.io.hdf5.tables
    del scprep.io.hdf5.tables
    try:
        check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    finally:
        # Always restore the attribute; otherwise a failure here would
        # leave it deleted for every test that runs afterwards.
        scprep.io.hdf5.tables = tables


def test_10X_HDF5_cellranger3():
    """Load the Cellranger 3 HDF5 fixture with each backend and compare it
    against the reference matrix returned by ``data.load_10X()``.
    """
    X = data.load_10X()
    h5_file = os.path.join(data.data_dir, "test_10X_cellranger3.h5")

    def check_matches_reference(X_hdf5):
        # Same container type, same values, same gene and cell labels.
        assert isinstance(X_hdf5, pd.SparseDataFrame)
        assert np.sum(np.sum(X != X_hdf5)) == 0
        np.testing.assert_array_equal(X.columns, X_hdf5.columns)
        np.testing.assert_array_equal(X.index, X_hdf5.index)

    # automatic tables backend
    check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    # explicit tables backend
    check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='tables'))
    # explicit h5py backend
    check_matches_reference(
        scprep.io.load_10X_HDF5(h5_file, backend='h5py'))
    # automatic h5py backend: temporarily remove the ``tables`` attribute
    # from the hdf5 module before loading without an explicit backend.
    tables = scprep.io.hdf5.tables
    del scprep.io.hdf5.tables
    try:
        check_matches_reference(scprep.io.load_10X_HDF5(h5_file))
    finally:
        # Always restore the attribute; otherwise a failure here would
        # leave it deleted for every test that runs afterwards.
        scprep.io.hdf5.tables = tables


Expand All @@ -174,6 +212,18 @@ def test_10X_HDF5_invalid_genome():
genome="invalid")


def test_10X_HDF5_genome_cellranger3():
    """Passing ``genome`` for the Cellranger 3 HDF5 fixture must raise
    ``NotImplementedError`` with the documented message.
    """
    cellranger3_path = os.path.join(
        data.data_dir, "test_10X_cellranger3.h5")
    expected_message = (
        "Selecting genomes for Cellranger 3.0 files is not "
        "currently supported. Please file an issue at "
        "https://github.com/KrishnaswamyLab/scprep/issues")
    assert_raise_message(
        NotImplementedError,
        expected_message,
        scprep.io.load_10X_HDF5,
        filename=cellranger3_path,
        genome="GRCh38")


def test_10X_HDF5_invalid_backend():
h5_file = os.path.join(data.data_dir, "test_10X.h5")
assert_raise_message(
Expand Down

0 comments on commit 4417ae6

Please sign in to comment.