https://github.com/chanzuckerberg/single-cell-curation/issues/614

In [None]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse

In [None]:
def validate(file):
    """Run ``cellxgene-schema validate`` on *file* and echo its output.

    All of stdout is printed first. stderr is then printed line by line up to
    and including the first line containing ``is_valid=``; the text after the
    last ``=`` on that line is returned (as a string). If no such line is
    found, every stderr line is printed and ``None`` is returned.
    """
    command = ['cellxgene-schema', 'validate', file]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    out_lines = completed.stdout.decode('utf-8').split('\n')
    print(*out_lines, sep='\n')

    err_lines = completed.stderr.decode('utf-8').split('\n')
    for err_line in err_lines:
        print(err_line)
        if 'is_valid=' not in err_line:
            continue
        # The verdict is whatever follows the final '=' on the marker line.
        return err_line.rpartition('=')[2]

In [None]:
def _report_matrix(label, matrix):
    """Print the dtype, storage format and integer-ness of *matrix*.

    *label* is the name used in the printed messages (e.g. ``'X'``).
    """
    print("Datatype of {} matrix: {}".format(label, matrix.dtype))
    print("Format of {} matrix: {}".format(label, get_format(matrix)))
    # Sparse matrices keep their stored values in ``.data``; a dense array is
    # inspected directly rather than through its ``.data`` memoryview.
    values = matrix.data if sparse.issparse(matrix) else np.asarray(matrix)
    if np.any(np.mod(values, 1) != 0):
        print('{} contains non-integer values'.format(label))
    else:
        print('{} is all integers'.format(label))


def save_and_test(adata, expected):
    """Round-trip *adata* through ``test.h5ad``, describe it, and validate it.

    The object is written to ``test.h5ad`` and read back, a summary of
    ``raw.X`` (when a raw slot is present) and ``X`` is printed, and the file
    is passed to ``validate``. ``PASSED`` is printed when the validator's
    verdict equals *expected*, ``ERROR`` otherwise.

    Parameters
    ----------
    adata
        Object to write; it must provide ``write(filename=...)``.
    expected
        Verdict string to compare against the return value of ``validate``
        (the callers in this file pass ``'True'`` or ``'False'``).

    The temporary file is removed even if writing, reading or validation
    raises, so a failed run does not leave a stale ``test.h5ad`` behind.
    """
    path = 'test.h5ad'
    try:
        adata.write(filename=path)
        adata = sc.read_h5ad(path)

        if adata.raw is not None:
            _report_matrix('raw.X', adata.raw.X)
        else:
            print("raw slot is absent")
        print('---------')

        _report_matrix('X', adata.X)
        print('------------------')

        valid = validate(path)
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # The write itself may have failed before creating the file.
        if os.path.exists(path):
            os.remove(path)

In [None]:
def get_format(x):
    """Return a short label describing how matrix *x* is stored.

    Returns the scipy format code (e.g. ``'csr'`` or ``'csc'``) for sparse
    input, ``"numpy array"`` for a dense ``numpy.ndarray``, and ``None`` for
    anything else.
    """
    if sparse.issparse(x):
        # ``.format`` is available on both sparse matrices and sparse arrays,
        # whereas ``getformat()`` has been deprecated/removed for sparse
        # arrays in recent SciPy releases.
        return x.format
    if isinstance(x, np.ndarray):
        return "numpy array"
    return None

## Test Valid Cases

In [None]:
# Raw counts in raw slot - csr
# Baseline case: the fixture is loaded and handed on without modification;
# the validator is expected to report 'True'.
adata = sc.read_h5ad("../valid.h5ad")
save_and_test(adata, 'True')

In [None]:
# Raw counts in raw slot - csc
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Re-encode the raw matrix as CSC before putting it back in the raw slot.
raw.X = sparse.csc_matrix(raw.X)
adata.raw = raw
save_and_test(adata, 'True')

In [None]:
# Raw counts in raw slot - np.ndarray
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify the raw matrix (assumes the fixture stores raw.X as a scipy sparse
# matrix, since toarray() is called on it).
raw.X = raw.X.toarray()
adata.raw = raw
save_and_test(adata, 'True')

In [None]:
# Raw counts in .X - csr
adata = sc.read_h5ad("../valid.h5ad")
# Move the raw matrix into .X and drop the raw slot, so .X is the only
# matrix left in the object.
adata.X = adata.raw.X
del adata.raw
save_and_test(adata, 'True')

In [None]:
# Raw counts in .X - csc
adata = sc.read_h5ad("../valid.h5ad")
# Convert this object's own raw matrix to CSC and move it into .X. Reading
# from adata.raw (rather than a `raw` variable left over from another cell)
# keeps the cell runnable on its own and independent of execution order.
adata.X = sparse.csc_matrix(adata.raw.X)
del adata.raw
save_and_test(adata, 'True')

In [None]:
# Raw counts in .X - np.ndarray
adata = sc.read_h5ad("../valid.h5ad")
# Move a dense copy of the raw matrix into .X and drop the raw slot.
adata.X = adata.raw.X.toarray()
del adata.raw
save_and_test(adata, 'True')

In [None]:
# .X is float64
adata = sc.read_h5ad("../valid.h5ad")
# Only .X is recast; the raw slot is left as loaded.
adata.X = adata.X.astype('float64')
save_and_test(adata, 'True')

In [None]:
# .X is float16
adata = sc.read_h5ad("../valid.h5ad")
# Only .X is recast; the raw slot is left as loaded.
adata.X = adata.X.astype('float16')
save_and_test(adata, 'True')

In [None]:
# .X is int with negatives
adata = sc.read_h5ad("../valid.h5ad")
# Densify so whole rows can be overwritten, then set the first five rows
# (cells) to -145.
adata.X = adata.X.toarray()
adata.X[0:5] = -145
# Back to CSR and cast to integer; the raw slot is left as loaded.
adata.X = sparse.csr_matrix(adata.X)
adata.X = adata.X.astype(int)
save_and_test(adata, 'True')

In [None]:
# .X has cells with all zeros
adata = sc.read_h5ad("../valid.h5ad")
# Densify so whole rows can be overwritten, then zero out the first five rows.
adata.X = adata.X.toarray()
adata.X[0:5] = 0
adata.X = sparse.csc_matrix(adata.X)
# Show the stored values of the first five cells as a sanity check on the edit.
print("Nonzero expression values in adata.X for first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'True')

In [None]:
#non-RNA assay without raw counts
adata = sc.read_h5ad("../valid.h5ad")
adata.obs['assay_ontology_term_id'] = 'EFO:0007045' #ATAC-seq
# Keep suspension_type in step with the ATAC-seq assay, as the other non-RNA
# cases in this notebook do, so the verdict depends only on the missing raw
# counts and not on whatever suspension_type the fixture happens to carry.
adata.obs['suspension_type'] = 'nucleus'
del adata.raw
save_and_test(adata, 'True')

In [None]:
#non-RNA without raw counts but with raw.X
adata = sc.read_h5ad("../valid.h5ad")
adata.obs['assay_ontology_term_id'] = 'EFO:0007045' #ATAC-seq
adata.obs['suspension_type'] = 'nucleus'
# Build the replacement raw layer from the current .X rather than from the
# fixture's raw matrix.
non_raw = ad.AnnData(X=adata.X, obs=adata.obs, var=adata.var)
# NOTE(review): this is an in-place drop on non_raw.var; it presumably does not
# touch adata.var because AnnData copies the frame on construction - confirm.
non_raw.var.drop(columns='feature_is_filtered', inplace=True)
adata.raw = non_raw
save_and_test(adata, 'True')

In [None]:
#non-RNA assay has cells with zeros
adata = sc.read_h5ad("../valid.h5ad")
adata.obs['assay_ontology_term_id'] = 'EFO:0007045' #ATAC-seq
adata.obs['suspension_type'] = 'nucleus'
# Densify so whole rows can be overwritten, then zero out the first five rows.
adata.X = adata.X.toarray()
adata.X[0:5] = 0
adata.X = sparse.csr_matrix(adata.X)
del adata.raw
# Show the stored values of the first five cells as a sanity check on the edit.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'True')

## Test Invalid Cases

In [None]:
# No raw counts
adata = sc.read_h5ad("../valid.h5ad")
# Drop the raw slot and leave .X as loaded; the validator is expected to
# report 'False'.
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are float64 for csr
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be recast.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
raw.X = raw.X.astype('float64')
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are float64 for csr
adata = sc.read_h5ad("../valid.h5ad")
# Move a float64 copy of the raw matrix into .X and drop the raw slot.
adata.X = adata.raw.X.astype('float64')
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are float16 for csr
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be recast.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
raw.X = raw.X.astype('float16')
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are float16 for csr
adata = sc.read_h5ad("../valid.h5ad")
# Move a float16 copy of the raw matrix into .X and drop the raw slot.
adata.X = adata.raw.X.astype('float16')
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are integers for csr
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be recast
# to an integer dtype.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
raw.X = raw.X.astype(int)
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are integers for csr
adata = sc.read_h5ad("../valid.h5ad")
# Move an integer-typed copy of the raw matrix into .X and drop the raw slot.
adata.X = adata.raw.X.astype(int)
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are float64 for csc
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Recast first, then re-encode as CSC.
raw.X = raw.X.astype('float64')
raw.X = sparse.csc_matrix(raw.X)
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are float64 for csc
adata = sc.read_h5ad("../valid.h5ad")
# Move a float64 copy of the raw matrix into .X, re-encode it as CSC, and
# drop the raw slot.
adata.X = adata.raw.X.astype('float64')
adata.X = sparse.csc_matrix(adata.X)
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are float16 for csc
# Not tested: the float16 data seems to be converted to float32 at some point in the process

In [None]:
# The raw counts in adata.X are float16 for csc
# Not tested: the float16 data seems to be converted to float32 at some point in the process

In [None]:
# The raw counts in adata.raw.X are integers for csc
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Cast to an integer dtype first, then re-encode as CSC.
raw.X = raw.X.astype(int)
raw.X = sparse.csc_matrix(raw.X)
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are integers for csc
adata = sc.read_h5ad("../valid.h5ad")
# Move an integer-typed copy of the raw matrix into .X, re-encode it as CSC,
# and drop the raw slot.
adata.X = adata.raw.X.astype(int)
adata.X = sparse.csc_matrix(adata.X)
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are float64 for np.ndarray
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Recast first, then densify.
raw.X = raw.X.astype('float64')
raw.X = raw.X.toarray()
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are float64 for np.ndarray
adata = sc.read_h5ad("../valid.h5ad")
# Move a float64 copy of the raw matrix into .X, densify it, and drop the
# raw slot.
adata.X = adata.raw.X.astype('float64')
adata.X = adata.X.toarray()
del adata.raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X are float16 for np.ndarray
# Not tested: attempting this case raised `ValueError: Output dtype not compatible with inputs.`

In [None]:
# The raw counts in adata.X are float16 for np.ndarray
# Not tested: attempting this case raised `ValueError: Output dtype not compatible with inputs.`

In [None]:
# The raw counts in adata.raw.X are integers for np.ndarray
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Cast to an integer dtype first, then densify.
raw.X = raw.X.astype(int)
raw.X = raw.X.toarray()
adata.raw = raw
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X are integers for np.ndarray
adata = sc.read_h5ad("../valid.h5ad")
# Move an integer-typed copy of the raw matrix into .X, densify it, and drop
# the raw slot.
adata.X = adata.raw.X.astype(int)
adata.X = adata.X.toarray()
del adata.raw
save_and_test(adata, 'False')

In [None]:
#raw layer includes some negative int values - in adata.raw.X
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be edited.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify so whole rows can be overwritten, set the first five rows to -145,
# then go back to CSR.
raw.X = raw.X.toarray()
raw.X[0:5] = -145
raw.X = sparse.csr_matrix(raw.X)
adata.raw = raw
# Show the stored raw values of the first five cells as a sanity check.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X.data))
save_and_test(adata, 'False')

In [None]:
#raw layer includes some negative int values - in adata.X
adata = sc.read_h5ad("../valid.h5ad")
# Start from this object's own raw matrix. Reading from adata.raw (rather than
# a `raw` variable left over from another cell) keeps the cell runnable on its
# own and independent of execution order.
adata.X = adata.raw.X.toarray()
# Set the first five rows (cells) to -145, then go back to CSR.
adata.X[0:5] = -145
adata.X = sparse.csr_matrix(adata.X)
del adata.raw
# Show the stored values of the first five cells as a sanity check on the edit.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X is csr and has zero counts for first 5 cells
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be edited.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify so whole rows can be overwritten, zero the first five rows, then
# go back to CSR.
raw.X = raw.X.toarray()
raw.X[0:5] = 0
raw.X = sparse.csr_matrix(raw.X)
adata.raw = raw
# Show the stored raw values of the first five cells as a sanity check.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X.data))
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X is csc and has zero counts for first 5 cells
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be edited.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify so whole rows can be overwritten, zero the first five rows, then
# re-encode as CSC.
raw.X = raw.X.toarray()
raw.X[0:5] = 0
raw.X = sparse.csc_matrix(raw.X)
adata.raw = raw
# Show the stored raw values of the first five cells as a sanity check.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X.data))
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.raw.X is np.ndarray and has zero counts for first 5 cells
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its matrix can be edited.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify, zero the first five rows, and keep the matrix dense.
raw.X = raw.X.toarray()
raw.X[0:5] = 0
adata.raw = raw
# Sanity check: print whichever of the first five raw rows are not all zero.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X[~np.all(adata[0:5,:].raw.X == 0, axis=1)]))
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X is csr and has zero counts for first 5 cells
adata = sc.read_h5ad("../valid.h5ad")
# Move a dense copy of the raw matrix into .X, zero the first five rows,
# re-encode as CSR, and drop the raw slot.
adata.X = adata.raw.X.toarray()
adata.X[0:5] = 0
adata.X = sparse.csr_matrix(adata.X)
del adata.raw
# Show the stored values of the first five cells as a sanity check on the edit.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X is csc and has zero counts for first 5 cells
adata = sc.read_h5ad("../valid.h5ad")
# Move a dense copy of the raw matrix into .X, zero the first five rows,
# re-encode as CSC, and drop the raw slot.
adata.X = adata.raw.X.toarray()
adata.X[0:5] = 0
adata.X = sparse.csc_matrix(adata.X)
del adata.raw
# Show the stored values of the first five cells as a sanity check on the edit.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

In [None]:
# The raw counts in adata.X is np.ndarray and has zero counts for first 5 cells
adata = sc.read_h5ad("../valid.h5ad")
# Move a dense copy of the raw matrix into .X, zero the first five rows,
# keep it dense, and drop the raw slot.
adata.X = adata.raw.X.toarray()
adata.X[0:5] = 0
del adata.raw
# Sanity check: print whichever of the first five rows are not all zero.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X[~np.all(adata[0:5,:].X == 0, axis=1)]))
save_and_test(adata, 'False')