https://github.com/chanzuckerberg/single-cell-curation/issues/514<br>
https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#tissue_type

In [1]:
import numpy as np
import os
import scanpy as sc
import subprocess

In [2]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and report its verdict.

    Both output streams of the validator are echoed in full (stdout first,
    then stderr) so the notebook shows every message it produced.

    Args:
        file: Path of the h5ad file handed to the validator CLI.

    Returns:
        The text following the last ``=`` on the first stderr line that
        contains ``is_valid=`` (``'True'`` or ``'False'`` in the validator's
        status line), with surrounding whitespace removed. ``None`` when no
        such line is present, e.g. when the validator aborted before
        reporting a status.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # splitlines() copes with both '\n' and '\r\n' and does not yield a
    # spurious empty trailing entry the way split('\n') does.
    for line in validate_process.stdout.decode('utf-8', errors='replace').splitlines():
        print(line)

    valid = None
    for line in validate_process.stderr.decode('utf-8', errors='replace').splitlines():
        print(line)
        # Keep the first status line but carry on echoing: returning here
        # would hide anything the validator logs after the status.
        if valid is None and 'is_valid=' in line:
            valid = line.split('=')[-1].strip()
    return valid

In [3]:
def save_and_test(adata, expected):
    """Write ``adata`` to a scratch h5ad file, validate it and print the outcome.

    The object is written to ``test.h5ad`` in the working directory, read
    back, and the ``tissue_type`` / ``tissue_ontology_term_id`` combinations
    found in ``obs`` are printed (or a note that ``tissue_type`` is absent).
    The file is then passed to ``validate`` and a coloured PASSED / ERROR
    banner is printed depending on whether the verdict equals ``expected``.

    Args:
        adata: Object providing ``write(filename=...)``; it is written with
            scanpy-compatible h5ad output and read back via ``sc.read_h5ad``.
        expected: Verdict the validator should report, as the string
            ``'True'`` or ``'False'`` (compared directly with the return
            value of ``validate``).

    Returns:
        None. The result is reported through printed output only.
    """
    filename = 'test.h5ad'
    try:
        adata.write(filename=filename)
        # Inspect what was actually persisted rather than the in-memory object.
        test_adata = sc.read_h5ad(filename)
        if 'tissue_type' in test_adata.obs.columns:
            print(test_adata.obs[['tissue_type','tissue_ontology_term_id']].value_counts())
        else:
            print('tissue_type absent')
        print('------------------')

        valid = validate(filename)
        print('------------------')
        if expected != valid:
            # bold red
            print('\033[1m\033[91mERROR\033[0m')
        else:
            # bold green
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Always clean up the scratch file, even when reading or validating
        # raised; guard the removal so a failed write does not mask the
        # original exception with FileNotFoundError.
        if os.path.exists(filename):
            os.remove(filename)

In [4]:
# Load the baseline fixture fully into memory; every test cell below mutates
# adata.obs on this single shared object and re-writes it to disk.
# (presumably '../valid.h5ad' is a schema-valid file -- the valid-case cells rely on that)
# backed='r' would be slightly quicker but produces an error with multiple writes
adata = sc.read_h5ad('../valid.h5ad')

**Test valid cases**

In [5]:
# Valid case: tissue_type 'tissue' paired with an UBERON term.
# The validator is expected to accept this combination.
adata.obs['tissue_type'] = 'tissue'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'True')

tissue_type  tissue_ontology_term_id
tissue       UBERON:0004784             2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.841347 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [6]:
# Valid case: tissue_type 'organoid' paired with an UBERON term.
# The validator is expected to accept this combination.
adata.obs['tissue_type'] = 'organoid'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'True')

tissue_type  tissue_ontology_term_id
organoid     UBERON:0004784             2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.832184 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [7]:
# Valid case: tissue_type 'cell culture' paired with a CL (Cell Ontology) term.
# The validator is expected to accept this combination.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034'
save_and_test(adata, 'True')

tissue_type   tissue_ontology_term_id
cell culture  CL:0000034                 2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.836515 with status is_valid=True
------------------
[1m[92mPASSED[0m


**Test invalid cases**

In [8]:
# Invalid case: 'Tissue' with a capital T is not one of the allowed
# tissue_type values, so validation is expected to fail even though the
# UBERON term itself is fine.
adata.obs['tissue_type'] = 'Tissue'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
Tissue       UBERON:0004784             2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Column 'tissue_type' in dataframe 'obs' contains invalid values '['Tissue']'. Values must be one of ['cell culture', 'organoid', 'tissue']
Validation complete in 0:00:00.502881 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [9]:
# Invalid case: tissue_type present but null (NaN) for every observation.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
adata.obs['tissue_type'] = np.nan
# Set the term explicitly so this cell does not depend on whatever value an
# earlier cell left behind; a valid UBERON term keeps the null tissue_type as
# the only thing under test.
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'False')

Series([], dtype: int64)
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Column 'tissue_type' in dataframe 'obs' must be categorical, not float64.
ERROR: Column 'tissue_type' in dataframe 'obs' contains invalid values '[nan]'. Values must be one of ['cell culture', 'organoid', 'tissue']
Validation complete in 0:00:00.490853 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [10]:
# Invalid case: tissue_type 'cell culture' paired with an UBERON term
# instead of a CL term; validation is expected to fail.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'False')

tissue_type   tissue_ontology_term_id
cell culture  UBERON:0004784             2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'UBERON:0004784' in 'tissue_ontology_term_id' is not a valid ontology term id of 'CL'. When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), nor 'CL:0000548' (animal cell).
Validation complete in 0:00:00.488373 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [17]:
# Invalid case: tissue_type 'cell culture' with the CL term for
# eukaryotic cell "CL:0000255"; validation is expected to fail for this term.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'CL:0000255'
save_and_test(adata, 'False')

tissue_type   tissue_ontology_term_id
cell culture  CL:0000255                 2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:0000255' in 'tissue_ontology_term_id' is not allowed. When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), nor 'CL:0000548' (animal cell).
Validation complete in 0:00:00.565188 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [18]:
# Invalid case: tissue_type 'cell culture' with the CL term for
# Eumycetozoan cell "CL:0000257"; validation is expected to fail for this term.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'CL:0000257'
save_and_test(adata, 'False')

tissue_type   tissue_ontology_term_id
cell culture  CL:0000257                 2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:0000257' in 'tissue_ontology_term_id' is not allowed. When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), nor 'CL:0000548' (animal cell).
Validation complete in 0:00:00.538268 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [19]:
# Invalid case: tissue_type 'cell culture' with the CL term for
# animal cell "CL:0000548"; validation is expected to fail for this term.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'CL:0000548'
save_and_test(adata, 'False')

tissue_type   tissue_ontology_term_id
cell culture  CL:0000548                 2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:0000548' in 'tissue_ontology_term_id' is not allowed. When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), nor 'CL:0000548' (animal cell).
Validation complete in 0:00:00.483191 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [11]:
# Invalid case: tissue_type 'organoid' paired with a CL term instead of an
# UBERON term; validation is expected to fail.
adata.obs['tissue_type'] = 'organoid'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
organoid     CL:0000034                 2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:0000034' in 'tissue_ontology_term_id' is not a valid ontology term id of 'UBERON'. When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
Validation complete in 0:00:00.487476 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [12]:
# Invalid case: tissue_type 'tissue' paired with a CL term instead of an
# UBERON term; validation is expected to fail.
adata.obs['tissue_type'] = 'tissue'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
tissue       CL:0000034                 2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:0000034' in 'tissue_ontology_term_id' is not a valid ontology term id of 'UBERON'. When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
Validation complete in 0:00:00.546616 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [13]:
# Invalid case: tissue_type 'tissue' with UBERON:0001062 itself, which is
# too broad a term to be accepted; validation is expected to fail.
adata.obs['tissue_type'] = 'tissue'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0001062'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
tissue       UBERON:0001062             2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'UBERON:0001062' in 'tissue_ontology_term_id' is not a child term id of '[['UBERON:0001062']]'. When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
Validation complete in 0:00:00.493438 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [20]:
# Invalid case: tissue_type 'organoid' with UBERON:0001062 itself, which is
# too broad a term to be accepted; validation is expected to fail.
adata.obs['tissue_type'] = 'organoid'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0001062'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id
organoid     UBERON:0001062             2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'UBERON:0001062' in 'tissue_ontology_term_id' is not a child term id of '[['UBERON:0001062']]'. When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
Validation complete in 0:00:00.496339 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [14]:
# Invalid case: the tissue_type column is missing from obs altogether.
# Use an otherwise-valid UBERON term so the absent column is the only thing
# under test, instead of inheriting whatever the previous cell assigned.
adata.obs['tissue_ontology_term_id'] = 'UBERON:0004784'
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
adata.obs.drop(columns=['tissue_type'], inplace=True, errors='ignore')
save_and_test(adata, 'False')

tissue_type absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Checking values with dependencies failed for adata.obs['tissue_ontology_term_id'], this is likely due to missing dependent column in adata.obs.
ERROR: Dataframe 'obs' is missing column 'tissue_type'.
Validation complete in 0:00:00.596750 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [15]:
# Invalid case ("current notation"): the term id carries a ' (organoid)'
# suffix in addition to tissue_type being set. Validation is expected to
# fail because the suffixed string is not a plain ontology term id.
# NOTE(review): presumably this suffix is the notation used before the
# tissue_type field was introduced -- confirm against the earlier schema.
adata.obs['tissue_type'] = 'organoid'
adata.obs['tissue_ontology_term_id'] = 'UBERON:0001062 (organoid)'
save_and_test(adata, 'False')

tissue_type  tissue_ontology_term_id  
organoid     UBERON:0001062 (organoid)    2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'UBERON:0001062 (organoid)' in 'tissue_ontology_term_id' is not a valid ontology term id of 'UBERON'. When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
Validation complete in 0:00:00.487389 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [16]:
# Invalid case ("current notation"): the term id carries a ' (cell culture)'
# suffix in addition to tissue_type being set. Validation is expected to
# fail because the suffixed string is not a plain ontology term id.
# NOTE(review): presumably this suffix is the notation used before the
# tissue_type field was introduced -- confirm against the earlier schema.
adata.obs['tissue_type'] = 'cell culture'
adata.obs['tissue_ontology_term_id'] = 'CL:0000034 (cell culture)'
save_and_test(adata, 'False')

tissue_type   tissue_ontology_term_id  
cell culture  CL:0000034 (cell culture)    2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:0000034 (cell culture)' in 'tissue_ontology_term_id' is not a valid ontology term id of 'CL'. When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), nor 'CL:0000548' (animal cell).
Validation complete in 0:00:00.487317 with status is_valid=False
------------------
[1m[92mPASSED[0m
