https://github.com/chanzuckerberg/single-cell-curation/issues/515

In [None]:
import numpy as np
import os
import scanpy as sc
import subprocess

In [None]:
def validate(file):
    """Run ``cellxgene-schema validate`` on *file* and return its verdict.

    The validator's stdout is echoed with ``print``. Its stderr is echoed
    line by line until the first line containing ``is_valid=`` is found.

    Args:
        file: Path of the file handed to the command line tool.

    Returns:
        The text after the last ``=`` on the verdict line, with surrounding
        whitespace removed (e.g. ``'True'`` or ``'False'``), or ``None`` when
        stderr contains no ``is_valid=`` line.
    """
    # Decoding inside subprocess (text mode) also normalises '\r\n' line
    # endings, so a stray '\r' can never end up in the returned verdict.
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding='utf-8',
        errors='replace',
    )
    for line in validate_process.stdout.splitlines():
        print(line)
    for line in validate_process.stderr.splitlines():
        print(line)
        if 'is_valid=' in line:
            # Strip so the caller's string comparison is not defeated by
            # trailing whitespace on the verdict line.
            return line.rsplit('=', 1)[-1].strip()
    # No verdict line was emitted.
    return None

In [None]:
def save_and_test(adata, expected):
    """Round-trip *adata* through a scratch h5ad file and validate that file.

    The object is written to ``test.h5ad`` in the working directory, read
    back, summarised, and validated with ``validate``. A coloured ``PASSED``
    is printed when the validator's verdict equals *expected*, otherwise
    ``ERROR``. The scratch file is removed afterwards, even if a step raises.

    Args:
        adata: Object exposing ``write(filename=...)``; the copy read back
            with ``sc.read_h5ad`` is the one inspected.
        expected: Verdict string to compare against the value returned by
            ``validate`` (the cells in this notebook pass ``'True'`` or
            ``'False'``).
    """
    scratch_file = 'test.h5ad'
    try:
        adata.write(filename=scratch_file)
        # Re-read so the summary reflects what was actually serialised.
        adata = sc.read_h5ad(scratch_file)
        if 'organism_ontology_term_id' in adata.obs.columns:
            print(adata.obs[['organism_ontology_term_id','self_reported_ethnicity_ontology_term_id']].value_counts())
        else:
            print('organism_ontology_term_id absent')
        print('------------------')
        valid = validate(scratch_file)
        print('------------------')
        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Clean up on failure paths too; the existence check keeps a failed
        # write from masking the original exception with FileNotFoundError.
        if os.path.exists(scratch_file):
            os.remove(scratch_file)

In [None]:
# Load the starting dataset. Every test cell below overwrites columns of
# adata.obs on this one shared object before calling save_and_test.
adata = sc.read_h5ad('../valid.h5ad')

### Test Valid Cases 

In [None]:
# Valid case: self_reported_ethnicity_ontology_term_id set to 'unknown'
# with organism NCBITaxon:9606; the validator is expected to report True.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'unknown'
save_and_test(adata, 'True')

In [None]:
# Valid case: self_reported_ethnicity_ontology_term_id as a comma-separated
# list of two terms, no spaces, in ascending order; expected verdict is True.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005,HANCESTRO:0014'
save_and_test(adata, 'True')

In [None]:
# Valid case (as written): self_reported_ethnicity_ontology_term_id set to
# 'na' with organism NCBITaxon:9606; the cell expects a True verdict.
# NOTE(review): the later mouse cells suggest 'na' is the non-human value; if
# the schema only allows 'na' for non-human organisms, this combination should
# be rejected -- confirm the organism and expected value against the schema.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'na'
save_and_test(adata, 'True')

### Test Invalid Cases 

In [None]:
# Invalid case: organism_ontology_term_id column absent from obs; the
# validator is expected to report False.
# errors='ignore' keeps this cell re-runnable once the column has already
# been dropped (a second plain drop would raise KeyError).
adata.obs.drop(columns=['organism_ontology_term_id'], inplace=True, errors='ignore')
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005'
save_and_test(adata, 'False')

In [None]:
# Invalid case: a single term with a leading space; expected verdict is False.
# The organism column is set again because the previous cell dropped it.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = ' HANCESTRO:0005'
save_and_test(adata, 'False')

In [None]:
# Invalid case: a single term with a trailing space; expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005 '
save_and_test(adata, 'False')

In [None]:
# Invalid case: a single term with both a leading and a trailing space;
# expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = ' HANCESTRO:0005 '
save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list whose second term has a leading space
# (i.e. ', ' used as the separator); expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005, HANCESTRO:0014'
save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list whose first term has a trailing space
# before the comma; expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005 ,HANCESTRO:0014'
save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list that is not in ascending lexical order
# (0014 listed before 0005); expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0014,HANCESTRO:0005'
save_and_test(adata, 'False')

In [None]:
# Invalid cases: each of these terms, used on its own as
# self_reported_ethnicity_ontology_term_id with organism NCBITaxon:9606, is
# expected to be rejected (verdict False). The terms are checked one at a
# time, in the listed order.
invalid_single_terms = [
    'HANCESTRO:0002',
    'HANCESTRO:0003',
    'HANCESTRO:0004',
    'HANCESTRO:0018',
    'HANCESTRO:0290',
    'HANCESTRO:0304',
    'HANCESTRO:0323',
    'HANCESTRO:0324',
    'HANCESTRO:0551',
    'HANCESTRO:0554',
    'HANCESTRO:0555',
    'HANCESTRO:0557',
    'HANCESTRO:0558',
    'HANCESTRO:0559',
    'HANCESTRO:0560',
    'HANCESTRO:0561',
    'HANCESTRO:0564',
    'HANCESTRO:0565',
    'HANCESTRO:0566',
    'GEO:000000374',
    'HANCESTRO:0029',
    'HANCESTRO:0030',
    'HANCESTRO:0031',
    'HANCESTRO:0032',
    'HANCESTRO:0033',
    'HANCESTRO:0034',
]
for term in invalid_single_terms:
    # Reset the organism on every iteration so each check is independent of
    # whatever an earlier cell left in adata.obs.
    adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
    adata.obs['self_reported_ethnicity_ontology_term_id'] = term
    save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list mixing valid terms
# (HANCESTRO:0005,HANCESTRO:0014) with an invalid one (HANCESTRO:0034);
# expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005,HANCESTRO:0014,HANCESTRO:0034'
save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list made up only of invalid terms
# (HANCESTRO:0033,HANCESTRO:0034); expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0033,HANCESTRO:0034'
save_and_test(adata, 'False')

In [None]:
# Invalid case: HANCESTRO:0051, a child of HANCESTRO:0002 (itself one of the
# rejected single terms tested above); expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0051'
save_and_test(adata, 'False')

In [None]:
# Invalid case: HANCESTRO:0306, a child of HANCESTRO:0304 (itself one of the
# rejected single terms tested above); expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0306'
save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list that repeats the same valid term
# (HANCESTRO:0005 twice); expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005,HANCESTRO:0005'
save_and_test(adata, 'False')

In [None]:
# Invalid case: valid terms stored as a Python list per row instead of a
# single comma-separated string; expected verdict is False.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005,HANCESTRO:0014'
# Split on ',' explicitly: the default separator is whitespace, which would
# leave the whole string as a single list element rather than two terms.
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].str.split(',')
save_and_test(adata, 'False')

In [None]:
# Invalid case: comma-separated list of otherwise valid HANCESTRO terms while
# the organism is mouse (NCBITaxon:10090); expected verdict is False.
# NOTE(review): changing only the organism may also invalidate other
# organism-dependent obs fields, so a False verdict here may not isolate the
# ethnicity rule -- confirm against the validator output.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:10090'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0005,HANCESTRO:0014'
save_and_test(adata, 'False')

In [None]:
# Invalid case: a single otherwise valid HANCESTRO term while the organism is
# mouse (NCBITaxon:10090); expected verdict is False.
# NOTE(review): as with the list variant of this test, other
# organism-dependent obs fields may also contribute to the False verdict --
# confirm against the validator output.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:10090'
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'HANCESTRO:0014'
save_and_test(adata, 'False')