Related issue: https://github.com/chanzuckerberg/single-cell-curation/issues/418

In [1]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad

In [2]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and return its verdict.

    Everything the command writes to stdout and stderr is echoed with
    ``print`` so the log appears in the cell output. The verdict is parsed
    from the first stderr line that contains ``is_valid=``; stderr lines
    after that one are not echoed.

    Parameters
    ----------
    file : str
        Path of the file handed to the ``cellxgene-schema`` command line.

    Returns
    -------
    str or None
        The text after the last ``=`` on the status line with surrounding
        whitespace removed (``'True'`` / ``'False'`` for the status lines
        shown in this notebook), or ``None`` when no stderr line contains
        ``is_valid=``.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            # Strip so a trailing '\r' (CRLF output) or stray spaces cannot
            # make the later comparison against 'True'/'False' fail.
            return line.split('=')[-1].strip()
    # No status line was found in stderr; make the "unknown" result explicit.
    return None

In [3]:
def save_and_test(adata, expected):
    """Write ``adata`` to 'test.h5ad', validate the file, and print PASSED/ERROR.

    The object is written to disk, read back with ``sc.read_h5ad``, and the
    shape of ``raw.X`` and the length of ``raw.var`` of the re-read object
    are printed. The file is then passed to ``validate`` and the returned
    verdict is compared with ``expected``.

    Parameters
    ----------
    adata
        Object providing ``write(filename=...)``; the cells in this notebook
        pass objects obtained from ``sc.read_h5ad``.
    expected : str
        Verdict the validator is expected to report, compared for equality
        with the return value of ``validate`` (``'True'`` or ``'False'`` in
        the calls in this notebook).

    Returns
    -------
    None
        The outcome is only printed (green PASSED or red ERROR).
    """
    try:
        adata.write(filename='test.h5ad')
        adata = sc.read_h5ad('test.h5ad')

        print("Shape of raw: {}".format(adata.raw.X.shape))
        print("Number of features in var: {}".format(len(adata.raw.var)))
        print('------------------')

        valid = validate('test.h5ad')
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Always clean up the scratch file, even when writing, reading or
        # validating raised; the guard avoids masking the original error
        # with FileNotFoundError when the write never created the file.
        if os.path.exists('test.h5ad'):
            os.remove('test.h5ad')

## Test Valid Cases

In [4]:
# Baseline: the unmodified reference file is expected to pass validation.
adata = sc.read_h5ad(filename="../valid.h5ad")
save_and_test(adata=adata, expected='True')

Shape of raw: (2000, 22356)
Number of features in var: 22356
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.647977 with status is_valid=True
------------------
[1m[92mPASSED[0m


## Test Invalid Cases

In [5]:
# Negative case: the validator is expected to reject a file in which
# adata.raw.X has more gene columns than adata.raw.var has rows.
adata = sc.read_h5ad('../valid.h5ad')
# Build a separate AnnData from the raw layer; it is assigned back as adata.raw below.
raw = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
l = len(raw.var)  # number of rows in raw.var before trimming
# Keep only the first l-356 variables of the main object.
adata = adata[:,0:l-356]
# Drop the last 356 rows of raw.var in place. raw.X is not modified here, so
# the two disagree afterwards (compare "Shape of raw" with "Number of features
# in var" in the cell output).
# NOTE(review): presumably done in place so AnnData does not re-check the
# dimensions on assignment — confirm.
raw.var.drop(raw.var.index[l-356:l], inplace=True)
adata.raw = raw

save_and_test(adata, 'False')

Shape of raw: (2000, 22356)
Number of features in var: 22000
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.641024 with status is_valid=True
------------------
[1m[91mERROR[0m
