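Test notebook for https://github.com/chanzuckerberg/single-cell-curation/issues/614: each cell below modifies the raw matrix of a valid h5ad file (integer dtype, or all-zero counts for the first 5 cells), runs `cellxgene-schema validate` on it, and compares the reported `is_valid` status with the expected one.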

In [1]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse

In [2]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and return its verdict.

    The validator's stdout and stderr are echoed in full, line by line, so
    the notebook output shows the complete log.

    Parameters
    ----------
    file : str
        Path of the file handed to the ``cellxgene-schema validate`` command.

    Returns
    -------
    str or None
        The text following the last ``=`` on the first stderr line that
        contains ``is_valid=`` (e.g. ``'True'``), with surrounding whitespace
        removed; ``None`` when stderr contains no such line.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # errors='replace' keeps a single undecodable byte in the validator's log
    # from aborting the whole check.
    stdout_text = validate_process.stdout.decode('utf-8', errors='replace')
    stderr_text = validate_process.stderr.decode('utf-8', errors='replace')

    # splitlines() copes with both '\n' and '\r\n' line endings and does not
    # yield a spurious empty trailing entry.
    for line in stdout_text.splitlines():
        print(line)

    valid = None
    for line in stderr_text.splitlines():
        # Keep printing after the status line so nothing the validator wrote
        # to stderr is hidden.
        print(line)
        if valid is None and 'is_valid=' in line:
            # strip() guards against stray whitespace / carriage returns that
            # would otherwise make the verdict compare unequal to 'True'/'False'.
            valid = line.split('=')[-1].strip()
    return valid

In [3]:
def save_and_test(adata, expected):
    """Round-trip ``adata`` through an h5ad file, validate it and print the outcome.

    The object is written to ``test.h5ad`` in the working directory, read
    back, and the dtype and storage format of its raw-count matrix are
    printed (``.raw.X`` when a raw layer is present, otherwise ``.X``).
    The file is then passed to ``validate`` and the returned verdict is
    compared with ``expected``: a coloured ``PASSED`` is printed when they
    match, ``ERROR`` otherwise.

    Parameters
    ----------
    adata : AnnData
        Object to write and validate.
    expected : str
        Verdict expected from ``validate``; callers in this notebook pass
        ``'True'`` or ``'False'``.

    Returns
    -------
    None
    """
    filename = 'test.h5ad'
    try:
        adata.write(filename=filename)
        # Inspect what was actually persisted, not the in-memory object.
        reloaded = sc.read_h5ad(filename)

        matrix = reloaded.raw.X if reloaded.raw is not None else reloaded.X
        print("Datatype of raw matrix: {}".format(matrix.dtype))
        print("Format of raw matrix: {}".format(get_format(matrix)))
        print('------------------')

        valid = validate(filename)
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Always clean up the scratch file, even when writing, reading or
        # validating raised; the existence check avoids masking that error
        # with a FileNotFoundError.
        if os.path.exists(filename):
            os.remove(filename)

In [4]:
def get_format(x):
    """Return a short label for the storage format of a matrix.

    Parameters
    ----------
    x : object
        Matrix to describe.

    Returns
    -------
    str or None
        The sparse format name (e.g. ``'csr'``, ``'csc'``) when ``x`` is a
        SciPy sparse container, ``'numpy array'`` when it is a
        ``numpy.ndarray``, and ``None`` for anything else.
    """
    if sparse.issparse(x):
        # The `format` attribute is available on sparse matrices and sparse
        # arrays alike, unlike the legacy `getformat()` method.
        return x.format
    if isinstance(x, np.ndarray):
        return "numpy array"
    return None

## Test Valid Cases

In [5]:
# Baseline: the unmodified reference file is round-tripped through
# save_and_test and is expected to come back with the verdict 'True'.
adata = sc.read_h5ad("../valid.h5ad")
save_and_test(adata, 'True')

Datatype of raw matrix: float32
Format of raw matrix: csr
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.679551 with status is_valid=True
------------------
[1m[92mPASSED[0m


## Test Invalid Cases

In [6]:
# Case: the raw counts in adata.raw.X are integers (csr).
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its X can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# NOTE(review): the integer cast is applied after construction rather than
# passed to the constructor -- presumably to avoid any dtype handling in
# AnnData.__init__; confirm before reordering.
raw.X = raw.X.astype(int)
adata.raw = raw
save_and_test(adata, 'False')

Datatype of raw matrix: int64
Format of raw matrix: csr
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.377719 with status is_valid=True
------------------
[1m[91mERROR[0m


In [7]:
# Case: no raw layer; the raw counts live in adata.X and are integers (csr).
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Overwrite X with an integer-typed copy of the raw matrix, then drop the raw
# layer so X is the only matrix left.
adata.X = adata.raw.X.astype(int)
del adata.raw
save_and_test(adata, 'False')

Datatype of raw matrix: int64
Format of raw matrix: csr
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.299035 with status is_valid=True
------------------
[1m[91mERROR[0m


In [8]:
# Case: the raw counts in adata.raw.X are integers, stored as csc.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its X can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Cast to integers first, then convert the storage format to csc.
raw.X = raw.X.astype(int)
raw.X = sparse.csc_matrix(raw.X)
adata.raw = raw
save_and_test(adata, 'False')

Datatype of raw matrix: int64
Format of raw matrix: csc
------------------
Loading dependencies
Loading validator modules

Starting validation...
Matrices are in CSC format; loading entire dataset into memory.
Validation complete in 0:00:00.400113 with status is_valid=True
------------------
[1m[91mERROR[0m


In [9]:
# Case: no raw layer; the raw counts live in adata.X and are integers, stored as csc.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Overwrite X with an integer-typed copy of the raw matrix, convert it to
# csc, then drop the raw layer so X is the only matrix left.
adata.X = adata.raw.X.astype(int)
adata.X = sparse.csc_matrix(adata.X)
del adata.raw
save_and_test(adata, 'False')

Datatype of raw matrix: int64
Format of raw matrix: csc
------------------
Loading dependencies
Loading validator modules

Starting validation...
Matrices are in CSC format; loading entire dataset into memory.
Validation complete in 0:00:00.312281 with status is_valid=True
------------------
[1m[91mERROR[0m


In [10]:
# Case: the raw counts in adata.raw.X are integers, stored as a dense np.ndarray.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its X can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Cast to integers first, then densify the sparse matrix.
raw.X = raw.X.astype(int)
raw.X = raw.X.toarray()
adata.raw = raw
save_and_test(adata, 'False')

Datatype of raw matrix: int64
Format of raw matrix: numpy array
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.521248 with status is_valid=True
------------------
[1m[91mERROR[0m


In [11]:
# Case: no raw layer; the raw counts live in adata.X and are integers,
# stored as a dense np.ndarray.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Overwrite X with an integer-typed copy of the raw matrix, densify it, then
# drop the raw layer so X is the only matrix left.
adata.X = adata.raw.X.astype(int)
adata.X = adata.X.toarray()
del adata.raw
save_and_test(adata, 'False')

Datatype of raw matrix: int64
Format of raw matrix: numpy array
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.433554 with status is_valid=True
------------------
[1m[91mERROR[0m


In [12]:
# Case: adata.raw.X is csr and has zero counts for the first 5 cells.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its X can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify so the first five rows can be overwritten with zeros, then convert
# back to csr.
raw.X = raw.X.toarray()
raw.X[0:5] = 0
raw.X = sparse.csr_matrix(raw.X)
adata.raw = raw
# Show the stored (nonzero) values of the first five rows in raw.X and in X
# before validating.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X.data))
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

Nonzero expression values in adata.raw.X for first 5 cells: []
Nonzero expression values in adata.X first 5 cells: [6.41681   5.462968  5.822448  ... 5.140746  3.9875417 3.667482 ]
Datatype of raw matrix: float32
Format of raw matrix: csr
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.646313 with status is_valid=True
------------------
[1m[91mERROR[0m


In [13]:
# Case: adata.raw.X is csc and has zero counts for the first 5 cells.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its X can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify so the first five rows can be overwritten with zeros, then convert
# to csc.
raw.X = raw.X.toarray()
raw.X[0:5] = 0
raw.X = sparse.csc_matrix(raw.X)
adata.raw = raw
# Show the stored (nonzero) values of the first five rows in raw.X and in X
# before validating.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X.data))
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

Nonzero expression values in adata.raw.X for first 5 cells: []
Nonzero expression values in adata.X first 5 cells: [6.41681   5.462968  5.822448  ... 5.140746  3.9875417 3.667482 ]
Datatype of raw matrix: float32
Format of raw matrix: csc
------------------
Loading dependencies
Loading validator modules

Starting validation...
Matrices are in CSC format; loading entire dataset into memory.
Validation complete in 0:00:00.657570 with status is_valid=True
------------------
[1m[91mERROR[0m


In [14]:
# Case: adata.raw.X is a dense np.ndarray and has zero counts for the first 5 cells.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Rebuild the raw layer as a standalone AnnData so its X can be replaced.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Densify and overwrite the first five rows with zeros; the matrix is left dense.
raw.X = raw.X.toarray()
raw.X[0:5] = 0
adata.raw = raw
# For the dense raw matrix, the boolean mask keeps only those of the first
# five rows that are not entirely zero.
print("Nonzero expression values in adata.raw.X for first 5 cells: {}".format(adata[0:5,:].raw.X[~np.all(adata[0:5,:].raw.X == 0, axis=1)]))
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

Nonzero expression values in adata.raw.X for first 5 cells: []
Nonzero expression values in adata.X first 5 cells: [6.41681   5.462968  5.822448  ... 5.140746  3.9875417 3.667482 ]
Datatype of raw matrix: float32
Format of raw matrix: numpy array
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.943125 with status is_valid=True
------------------
[1m[91mERROR[0m


In [15]:
# Case: no raw layer; adata.X is csr and has zero counts for the first 5 cells.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Replace X with a dense copy of the raw matrix, zero the first five rows,
# convert back to csr, then drop the raw layer.
adata.X = adata.raw.X.toarray()
adata.X[0:5] = 0
adata.X = sparse.csr_matrix(adata.X)
del adata.raw
# Show the stored (nonzero) values of the first five rows before validating.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

Nonzero expression values in adata.X first 5 cells: []
Datatype of raw matrix: float32
Format of raw matrix: csr
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.569957 with status is_valid=True
------------------
[1m[91mERROR[0m


In [16]:
# Case: no raw layer; adata.X is csc and has zero counts for the first 5 cells.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Replace X with a dense copy of the raw matrix, zero the first five rows,
# convert to csc, then drop the raw layer.
adata.X = adata.raw.X.toarray()
adata.X[0:5] = 0
adata.X = sparse.csc_matrix(adata.X)
del adata.raw
# Show the stored (nonzero) values of the first five rows before validating.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X.data))
save_and_test(adata, 'False')

Nonzero expression values in adata.X first 5 cells: []
Datatype of raw matrix: float32
Format of raw matrix: csc
------------------
Loading dependencies
Loading validator modules

Starting validation...
Matrices are in CSC format; loading entire dataset into memory.
Validation complete in 0:00:00.553223 with status is_valid=True
------------------
[1m[91mERROR[0m


In [17]:
# Case: no raw layer; adata.X is a dense np.ndarray and has zero counts for
# the first 5 cells.
# Expected verdict from the validator: 'False'.
adata = sc.read_h5ad("../valid.h5ad")
# Replace X with a dense copy of the raw matrix and zero the first five rows;
# the matrix is left dense. Then drop the raw layer.
adata.X = adata.raw.X.toarray()
adata.X[0:5] = 0
del adata.raw
# The boolean mask keeps only those of the first five rows that are not
# entirely zero.
print("Nonzero expression values in adata.X first 5 cells: {}".format(adata[0:5,:].X[~np.all(adata[0:5,:].X == 0, axis=1)]))
save_and_test(adata, 'False')

Nonzero expression values in adata.X first 5 cells: []
Datatype of raw matrix: float32
Format of raw matrix: numpy array
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.864831 with status is_valid=True
------------------
[1m[91mERROR[0m
