Manual tests of reserved metadata key validation with `cellxgene-schema validate`, for https://github.com/chanzuckerberg/single-cell-curation/issues/519

In [6]:
import numpy as np
import os
import scanpy as sc
import subprocess

In [7]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and return its verdict.

    The command's stdout is echoed in full, then its stderr is echoed line by
    line up to and including the line that carries the ``is_valid=`` status.

    Args:
        file: Path of the file passed to the ``cellxgene-schema`` CLI.

    Returns:
        The text after the last ``=`` on the status line, with surrounding
        whitespace removed (``'True'`` or ``'False'`` in the recorded runs),
        or ``None`` when stderr contains no ``is_valid=`` line.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            # Strip so a trailing '\r' (CRLF output) or stray spaces cannot
            # make the verdict differ from the plain 'True'/'False' strings
            # that callers compare against.
            return line.split('=')[-1].strip()
    # No status line was emitted by the validator.
    return None

In [8]:
def save_and_test(adata, expected):
    """Round-trip ``adata`` through ``test.h5ad`` and check the validator verdict.

    Writes ``adata`` to ``test.h5ad`` in the working directory, reads the file
    back, prints the reloaded ``var`` column names, runs ``validate`` on the
    file and prints a coloured PASSED/ERROR banner depending on whether the
    verdict equals ``expected``. The temporary file is removed afterwards,
    including when any of those steps raises.

    Args:
        adata: Object to test; must provide ``write(filename=...)``.
        expected: Verdict string compared against the return value of
            ``validate`` (the notebook passes ``'True'`` or ``'False'``).
    """
    path = 'test.h5ad'
    try:
        adata.write(filename=path)
        # Inspect what was actually persisted, not the in-memory object.
        reloaded = sc.read_h5ad(path)
        print(reloaded.var.keys())
        print('------------------')
        valid = validate(path)
        print('------------------')
        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Clean up even on failure so a stale file cannot linger in the
        # working directory; guard against the write itself having failed.
        if os.path.exists(path):
            os.remove(path)

In [9]:
# Load the baseline dataset that the test cells below modify and pass to
# save_and_test (presumably a file that already passes validation, going by
# its name — confirm).
adata = sc.read_h5ad('../valid.h5ad')

### Test Valid Cases

In [10]:
# Valid case: 'feature_is_filtered' is a non-reserved metadata key.
adata.var['feature_is_filtered'] = False
save_and_test(adata, 'True')
# 'feature_is_filtered' is deliberately not dropped afterwards, so that in the
# cells that follow the modification under test is the only thing that can
# prevent validation.

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.880320 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [11]:
# Valid case: non-duplicate, non-reserved metadata keys.
adata.var['feature_is_filtered'] = False
adata.var['feature_is_filtered_1'] = False
try:
    save_and_test(adata, 'True')
finally:
    # Always undo the extra column so a failure in save_and_test cannot leak
    # this modification into the cells that run afterwards.
    adata.var.drop(columns=['feature_is_filtered_1'], inplace=True)

Index(['Unnamed: 0', 'feature_is_filtered', 'feature_is_filtered_1'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.870057 with status is_valid=True
------------------
[1m[92mPASSED[0m


### Test Invalid Cases 

In [12]:
# Invalid case: reserved metadata key 'citation' in uns.
adata.uns['citation'] = 'test'
try:
    save_and_test(adata, 'False')
finally:
    # Always undo the modification so a failure in save_and_test cannot leak
    # it into the cells that run afterwards.
    del adata.uns['citation']

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.865374 with status is_valid=True
------------------
[1m[91mERROR[0m


In [13]:
# Invalid case: reserved metadata key 'feature_length' in var.
adata.var['feature_length'] = 'test'
try:
    save_and_test(adata, 'False')
finally:
    # Always undo the modification so a failure in save_and_test cannot leak
    # it into the cells that run afterwards.
    adata.var.drop(columns=['feature_length'], inplace=True)

Index(['Unnamed: 0', 'feature_is_filtered', 'feature_length'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.866196 with status is_valid=True
------------------
[1m[91mERROR[0m


In [14]:
# Invalid case: reserved metadata key 'schema_reference' in uns.
adata.uns['schema_reference'] = 'test'
try:
    save_and_test(adata, 'False')
finally:
    # Always undo the modification so a failure in save_and_test cannot leak
    # it into the cells that run afterwards.
    del adata.uns['schema_reference']

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.867357 with status is_valid=True
------------------
[1m[91mERROR[0m


In [16]:
# Invalid case: non-reserved metadata key that starts with '__'.
adata.var['__feature_is_filtered'] = False
try:
    save_and_test(adata, 'False')
finally:
    # Always undo the modification so a failure in save_and_test cannot leak
    # it into the cells that run afterwards.
    adata.var.drop(columns=['__feature_is_filtered'], inplace=True)

Index(['Unnamed: 0', 'feature_is_filtered', '__feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__feature_is_filtered' in 'var' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.488876 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [17]:
# Invalid case: reserved metadata key that starts with '__'.
adata.var['__schema_reference'] = 'test'
try:
    save_and_test(adata, 'False')
finally:
    # Always undo the modification so a failure in save_and_test cannot leak
    # it into the cells that run afterwards.
    adata.var.drop(columns=['__schema_reference'], inplace=True)

Index(['Unnamed: 0', 'feature_is_filtered', '__schema_reference'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__schema_reference' in 'var' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.482974 with status is_valid=False
------------------
[1m[92mPASSED[0m
