In [1]:
#https://github.com/chanzuckerberg/single-cell-curation/issues/517
#https://lattice.atlassian.net/browse/WRN-688
#https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#cell_type_ontology_term_id

import numpy as np
import os
import scanpy as sc
import subprocess

In [2]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and report its verdict.

    Everything the validator wrote to stdout is echoed first. Stderr is then
    echoed line by line until the status line (the one containing
    ``is_valid=``) has been printed, at which point the function returns.

    Parameters
    ----------
    file : str
        Path of the h5ad file handed to the validator.

    Returns
    -------
    str or None
        The text after the last ``=`` on the status line, with surrounding
        whitespace removed (``'True'`` / ``'False'`` in the runs recorded in
        this notebook), or ``None`` when stderr holds no status line.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            # Strip so a trailing '\r' (CRLF output) or stray whitespace does
            # not break the caller's comparison against 'True' / 'False'.
            return line.split('=')[-1].strip()
    # No status line was found on stderr.
    return None

In [3]:
def save_and_test(adata, expected):
    """Round-trip ``adata`` through a temporary h5ad file and validate it.

    The object is written to ``test.h5ad``, read back, and the value counts of
    its ``cell_type_ontology_term_id`` obs column are printed (or a notice if
    the column is absent). The file is then passed to ``validate`` and the
    result is compared with ``expected``.

    Parameters
    ----------
    adata
        Object providing ``write(filename=...)``; re-read with
        ``sc.read_h5ad`` before inspection.
    expected : str
        The value ``validate`` is expected to return (``'True'`` or
        ``'False'`` in this notebook).

    Prints ``PASSED`` when the validator's answer equals ``expected`` and
    ``ERROR`` otherwise, wrapped in ANSI escape sequences for emphasis.
    Returns nothing.
    """
    test_file = 'test.h5ad'
    try:
        adata.write(filename=test_file)
        adata = sc.read_h5ad(test_file)
        if 'cell_type_ontology_term_id' in adata.obs.columns:
            print(adata.obs['cell_type_ontology_term_id'].value_counts())
        else:
            print('cell_type_ontology_term_id is absent')
        print('------------------')
        valid = validate(test_file)
        print('------------------')
        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Always clean up the temporary file, even when writing, reading or
        # validating raised; the guard avoids masking the original error if
        # the file was never created.
        if os.path.exists(test_file):
            os.remove(test_file)

In [4]:
# Load the fixture that the test cells below modify and re-validate.
# NOTE(review): the file is named valid.h5ad, so presumably it passes validation as-is - confirm.
adata = sc.read_h5ad('../valid.h5ad') #backed='r' would be slightly quicker but produces an error with multiple writes

**Test valid cases**

In [5]:
# Valid case: set every row to the CL term CL:0000003; validation is expected to pass
adata.obs['cell_type_ontology_term_id'] = 'CL:0000003'
save_and_test(adata, 'True')

CL:0000003    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.634853 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [22]:
# Valid case: the all-zero CL identifier (CL:0000000) is expected to pass validation
adata.obs['cell_type_ontology_term_id'] = 'CL:0000000'
save_and_test(adata, 'True')

CL:0000000    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.635489 with status is_valid=True
------------------
[1m[92mPASSED[0m


**Test invalid cases**

In [6]:
# Invalid case: an empty string in place of a term id is expected to fail validation
adata.obs['cell_type_ontology_term_id'] = ''
save_and_test(adata, 'False')

    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: '' in 'cell_type_ontology_term_id' is not a valid ontology term id of 'CL'.
Validation complete in 0:00:00.306711 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [7]:
# Invalid case: NaN in place of a term id is expected to fail validation.
# Uses np.nan because the np.NaN alias was removed in NumPy 2.0.
adata.obs['cell_type_ontology_term_id'] = np.nan
save_and_test(adata, 'False')

Series([], Name: cell_type_ontology_term_id, dtype: int64)
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Column 'cell_type_ontology_term_id' in dataframe 'obs' must not contain NaN values.
Validation complete in 0:00:00.304265 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [9]:
# Invalid case: a term id from another ontology (UBERON instead of CL) is expected to fail validation
adata.obs['cell_type_ontology_term_id'] = 'UBERON:0004784'
save_and_test(adata, 'False')

UBERON:0004784    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'UBERON:0004784' in 'cell_type_ontology_term_id' is not a valid ontology term id of 'CL'.
Validation complete in 0:00:00.307715 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [11]:
# Invalid case: the column is missing entirely; validation is expected to fail.
# The column is dropped from a copy so the shared `adata` is left untouched and
# this cell can be re-run without a KeyError from dropping an absent column.
adata_no_cell_type = adata.copy()
adata_no_cell_type.obs = adata_no_cell_type.obs.drop(columns=['cell_type_ontology_term_id'])
save_and_test(adata_no_cell_type, 'False')

cell_type_ontology_term_id is absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Dataframe 'obs' is missing column 'cell_type_ontology_term_id'.
Validation complete in 0:00:00.304849 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [10]:
# Invalid case: eukaryotic cell "CL:0000255" is a CL term this test expects the validator to reject
# NOTE(review): presumably one of the terms the schema section linked in the first cell forbids - confirm
adata.obs['cell_type_ontology_term_id'] = 'CL:0000255'
save_and_test(adata, 'False')

CL:0000255    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.635245 with status is_valid=True
------------------
[1m[91mERROR[0m


In [12]:
# Invalid case: Eumycetozoan cell "CL:0000257" is a CL term this test expects the validator to reject
# NOTE(review): presumably one of the terms the schema section linked in the first cell forbids - confirm
adata.obs['cell_type_ontology_term_id'] = 'CL:0000257'
save_and_test(adata, 'False')

CL:0000257    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.641226 with status is_valid=True
------------------
[1m[91mERROR[0m


In [13]:
# Invalid case: Animal cell "CL:0000548" is a CL term this test expects the validator to reject
# NOTE(review): presumably one of the terms the schema section linked in the first cell forbids - confirm
adata.obs['cell_type_ontology_term_id'] = 'CL:0000548'
save_and_test(adata, 'False')

CL:0000548    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.634580 with status is_valid=True
------------------
[1m[91mERROR[0m


In [23]:
# Invalid case: wrong number of 0's in CL term - eight digits after the prefix here,
# whereas the other CL ids used in this notebook have seven; expected to fail validation
adata.obs['cell_type_ontology_term_id'] = 'CL:00000123'
save_and_test(adata, 'False')

CL:00000123    2000
Name: cell_type_ontology_term_id, dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: 'CL:00000123' in 'cell_type_ontology_term_id' is not a valid ontology term id of 'CL'.
Validation complete in 0:00:00.307677 with status is_valid=False
------------------
[1m[92mPASSED[0m
