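Manual tests of `cellxgene-schema validate` for reserved metadata keys and fields that start with `__`.

Related issue: https://github.com/chanzuckerberg/single-cell-curation/issues/519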

In [14]:
import numpy as np
import os
import scanpy as sc
import subprocess

In [15]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and report the outcome.

    The validator's stdout is echoed in full, then its stderr is echoed line by
    line up to and including the first line that contains ``is_valid=``.

    Args:
        file: Path of the h5ad file handed to the validator CLI.

    Returns:
        The text after the last ``=`` on the status line, with surrounding
        whitespace removed (e.g. ``'True'`` or ``'False'``), or ``None`` when
        no status line is found in stderr.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            # Strip the value: splitting on '\n' leaves a trailing '\r' on CRLF
            # output, which would make 'True\r' != 'True' in the caller.
            return line.split('=')[-1].strip()
    # No status line was emitted (e.g. the validator aborted early).
    return None

In [16]:
def save_and_test(adata, expected):
    """Round-trip ``adata`` through disk, validate it, and print PASSED/ERROR.

    The object is written to ``test.h5ad`` in the working directory, read back,
    its ``var`` keys are printed, and the file is run through ``validate``.
    The temporary file is removed afterwards, even if any step raises.

    Args:
        adata: Object exposing ``write(filename=...)``; read back via
            ``sc.read_h5ad``.
        expected: The status string ``validate`` should return
            (``'True'`` or ``'False'``).
    """
    path = 'test.h5ad'
    try:
        adata.write(filename=path)
        # Re-read so the printed keys reflect what was actually persisted.
        reloaded = sc.read_h5ad(path)
        print(reloaded.var.keys())
        print('------------------')
        valid = validate(path)
        print('------------------')
        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Always clean up; guard for the case where the write itself failed.
        if os.path.exists(path):
            os.remove(path)

In [17]:
# Load the baseline h5ad fixture. Only the next cell mutates this copy;
# every later test cell reloads the file itself.
adata = sc.read_h5ad('../valid.h5ad')

### Test Valid Cases

In [18]:
# Valid case: 'feature_is_filtered' is a non-reserved var key, so validation
# is expected to pass. This cell mutates the `adata` loaded in the previous
# cell instead of reloading the fixture.
# NOTE(review): recorded outputs of later cells list 'feature_is_filtered'
# right after a fresh load, so this assignment appears to overwrite an existing
# column rather than add a new key — confirm that is intended.
adata.var['feature_is_filtered'] = False
save_and_test(adata, 'True')

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.833550 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [19]:
# Valid case: add a new var column whose name is neither reserved nor a
# duplicate of an existing column; validation is expected to pass.
adata = sc.read_h5ad('../valid.h5ad')
adata.var['feature_is_filtered_1'] = False
save_and_test(adata, 'True')

Index(['Unnamed: 0', 'feature_is_filtered', 'feature_is_filtered_1'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.836993 with status is_valid=True
------------------
[1m[92mPASSED[0m


### Test Invalid Cases 

In [20]:
# Invalid case: 'citation' is a reserved key of uns, so validation is
# expected to fail.
adata = sc.read_h5ad('../valid.h5ad')
adata.uns['citation'] = 'test'
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Column 'citation' is a reserved column name of 'uns'. Remove it from h5ad and try again.
Validation complete in 0:00:00.577517 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [21]:
# Invalid case: 'feature_length' is a reserved column name of var, so
# validation is expected to fail.
adata = sc.read_h5ad('../valid.h5ad')
adata.var['feature_length'] = 'test'
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered', 'feature_length'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Add labels error: Column 'feature_length' is a reserved column name of 'var'. Remove it from h5ad and try again.
Validation complete in 0:00:00.493258 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [22]:
# Invalid case: 'schema_reference' is a reserved key of uns, so validation is
# expected to fail.
adata = sc.read_h5ad('../valid.h5ad')
adata.uns['schema_reference'] = 'test'
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Column 'schema_reference' is a reserved column name of 'uns'. Remove it from h5ad and try again.
Validation complete in 0:00:00.485001 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [23]:
# Invalid case (var): the column name is not itself reserved, but it starts
# with '__'; the prefix alone is expected to fail validation.
adata = sc.read_h5ad('../valid.h5ad')
adata.var['__feature_mightbe_filtered'] = False
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered', '__feature_mightbe_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__feature_mightbe_filtered' in 'var' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.495310 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [24]:
# Invalid case (var): a reserved key name ('schema_reference') carrying a '__'
# prefix; expected to fail validation.
adata = sc.read_h5ad('../valid.h5ad')
adata.var['__schema_reference'] = 'test'
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered', '__schema_reference'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__schema_reference' in 'var' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.483374 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [25]:
# Invalid case (raw.var): a '__'-prefixed column added to raw.var is expected
# to fail validation. The key listing printed by save_and_test covers
# adata.var only, so the new raw.var column does not show up there.
adata = sc.read_h5ad('../valid.h5ad')
adata.raw.var['__myfield'] = False
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__myfield' in 'raw.var' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.488880 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [26]:
# Invalid case (obs): two '__'-prefixed columns added to obs; validation is
# expected to fail and report each offending field.
adata = sc.read_h5ad('../valid.h5ad')
adata.obs['__myfield'] = False
adata.obs['__myotherfield'] = True
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__myfield' in 'obs' is invalid. Fields that start with '__' are reserved.
ERROR: The field '__myotherfield' in 'obs' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.483493 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [27]:
# Invalid case (uns): a '__'-prefixed key added to uns is expected to fail
# validation.
adata = sc.read_h5ad('../valid.h5ad')
adata.uns['__myfield'] = False
save_and_test(adata, 'False')

Index(['Unnamed: 0', 'feature_is_filtered'], dtype='object')
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The field '__myfield' in 'uns' is invalid. Fields that start with '__' are reserved.
Validation complete in 0:00:00.482300 with status is_valid=False
------------------
[1m[92mPASSED[0m
