In [47]:
#https://github.com/chanzuckerberg/single-cell-curation/issues/516
import numpy as np
import os
import scanpy as sc
import subprocess

In [48]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and return its verdict.

    The subprocess's stdout is echoed in full. Its stderr is echoed line by
    line up to and including the first line containing ``is_valid=``.

    Parameters
    ----------
    file : str
        Path handed to ``cellxgene-schema validate``.

    Returns
    -------
    str or None
        The text after the last ``=`` on the status line, with surrounding
        whitespace removed (``'True'`` / ``'False'`` in the recorded runs), or
        ``None`` when stderr contains no ``is_valid=`` line.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            # Splitting on '\n' leaves a trailing '\r' on CRLF output; strip it
            # so callers can compare the verdict against 'True' / 'False'.
            return line.split('=')[-1].strip()
    # No status line was emitted (e.g. the tool failed before validating).
    return None

In [49]:
def save_and_test(adata, expected):
    """Write ``adata`` to a temporary h5ad file, validate it and report the result.

    Prints the value counts of the ``tissue_type`` / ``tissue_ontology_term_id``
    combinations in ``adata.obs`` (or a notice when ``tissue_type`` is missing),
    runs ``validate`` on the written file, then prints a coloured PASSED when
    the verdict equals ``expected`` and ERROR otherwise.

    Parameters
    ----------
    adata : AnnData
        Object to write and validate.
    expected : str
        Verdict the validator should return, compared against the string
        returned by ``validate`` (``'True'`` or ``'False'``).
    """
    # Summarise the obs columns under test so each cell's output is
    # self-describing.
    if 'tissue_type' in adata.obs.columns:
        print(adata.obs[['tissue_type', 'tissue_ontology_term_id']].value_counts())
    else:
        print('tissue_type absent')
    print('------------------')

    try:
        adata.write(filename='test.h5ad')
        valid = validate('test.h5ad')
    finally:
        # Always clean up, even if writing or validation raised.
        if os.path.exists('test.h5ad'):
            os.remove('test.h5ad')
    print('------------------')
    if expected != valid:
        print('\033[1m\033[91mERROR\033[0m')
    else:
        print('\033[1m\033[92mPASSED\033[0m')

In [50]:
# Read the baseline file that the test cells below mutate and re-validate.
# backed='r' would be slightly quicker but produces an error with multiple writes.
adata = sc.read_h5ad('../valid.h5ad')

**Test valid cases**

In [51]:
# Valid case: tissue_type 'tissue' paired with an UBERON term; the validator
# is expected to report is_valid=True.
adata.obs['tissue_type'] = 'tissue'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'True')

tissue_type  tissue_ontology_term_id
tissue       UBERON:0004784             3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.085725 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [52]:
# Valid case: tissue_type 'organoid' paired with an UBERON term; the validator
# is expected to report is_valid=True.
adata.obs['tissue_type'] = 'organoid'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'True')

tissue_type  tissue_ontology_term_id
organoid     UBERON:0004784             3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.085846 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [53]:
# Valid case: tissue_type 'cell culture' paired with a CL term; the validator
# is expected to report is_valid=True.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034'
save_and_test(adata, 'True')

tissue_type   tissue_ontology_term_id
cell culture  CL:0000034                 3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.088309 with status is_valid=True
------------------
[1m[92mPASSED[0m


**Test invalid cases**

In [54]:
# Invalid case: 'Tissue' with a capital T. tissue_ontology_term_id is not
# reassigned here, so it keeps the value set by an earlier cell
# (CL:0000034 when the cells run in order).
adata.obs['tissue_type'] = 'Tissue'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
Tissue       CL:0000034                 3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.163020 with status is_valid=True
------------------
[1m[91mERROR[0m


In [55]:
# Invalid case: tissue_type is missing (NaN) for every row.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
adata.obs['tissue_type'] = np.nan
save_and_test(adata, 'False')

Series([], dtype: int64)
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.084717 with status is_valid=True
------------------
[1m[91mERROR[0m


In [56]:
# Invalid case: tissue_type 'cell culture' combined with an UBERON term; the
# validator is expected to report is_valid=False.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'False')

tissue_type   tissue_ontology_term_id
cell culture  UBERON:0004784             3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.085726 with status is_valid=True
------------------
[1m[91mERROR[0m


In [57]:
# Invalid case: tissue_type 'organoid' combined with a CL term; the validator
# is expected to report is_valid=False.
adata.obs['tissue_type'] = 'organoid'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
organoid     CL:0000034                 3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.097522 with status is_valid=True
------------------
[1m[91mERROR[0m


In [58]:
# Invalid case: tissue_type 'tissue' combined with a CL term; the validator
# is expected to report is_valid=False.
adata.obs['tissue_type'] = 'tissue'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
tissue       CL:0000034                 3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.089813 with status is_valid=True
------------------
[1m[91mERROR[0m


In [59]:
# Invalid case: tissue_type 'tissue' with UBERON:0001062; the validator is
# expected to report is_valid=False.
# NOTE(review): presumably this specific UBERON term is disallowed for
# 'tissue' — confirm against the schema.
adata.obs['tissue_type'] = 'tissue'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0001062'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
tissue       UBERON:0001062             3000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.106240 with status is_valid=True
------------------
[1m[91mERROR[0m


In [60]:
# Invalid case: the tissue_type column is removed from obs altogether.
del adata.obs['tissue_type']
save_and_test(adata, 'False')

tissue_type absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.090693 with status is_valid=True
------------------
[1m[91mERROR[0m
