https://github.com/chanzuckerberg/single-cell-curation/issues/519

In [None]:
import numpy as np
import os
import scanpy as sc
import subprocess

In [None]:
def validate(file):
    """Run ``cellxgene-schema validate`` on *file* and return its verdict.

    The captured stdout is echoed first, then stderr is echoed line by line
    until a line containing ``is_valid=`` is found. The text after the last
    ``=`` on that line is returned as a string (the callers in this notebook
    compare it against ``'True'`` / ``'False'``). Returns ``None`` if no such
    line appears in stderr.
    """
    completed = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Echo everything the validator wrote to stdout, one line per print.
    print(*completed.stdout.decode('utf-8').split('\n'), sep='\n')

    # Echo stderr up to and including the status line, then stop.
    for err_line in completed.stderr.decode('utf-8').split('\n'):
        print(err_line)
        if 'is_valid=' not in err_line:
            continue
        return err_line.rpartition('=')[2]

    return None

In [None]:
def save_and_test(adata, expected):
    """Round-trip *adata* through ``test.h5ad``, validate it, and report the result.

    Args:
        adata: Object to write out; it must provide ``write(filename=...)``
            (in this notebook, the result of ``sc.read_h5ad``).
        expected: The validity string expected back from ``validate``
            (``'True'`` or ``'False'``).

    Prints the ``var`` keys of the re-read file, the validator output, and a
    colored ``PASSED`` / ``ERROR`` verdict depending on whether the validator
    result equals *expected*. The temporary file is removed even when
    writing, reading or validation raises, so a failed cell does not leave a
    stale ``test.h5ad`` behind.
    """
    test_file = 'test.h5ad'
    try:
        adata.write(filename=test_file)
        # Re-read so the printed keys reflect what was actually stored on disk.
        adata = sc.read_h5ad(test_file)
        print(adata.var.keys())
        print('------------------')
        valid = validate(test_file)
        print('------------------')
        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # The file may not exist if the write itself failed early.
        if os.path.exists(test_file):
            os.remove(test_file)

In [None]:
# Load the baseline dataset; every test cell below mutates this shared object
# and then restores it.
adata = sc.read_h5ad('valid.h5ad')

### Test Valid Cases

In [None]:
# non-reserved metadata key
# Valid case: set a 'feature_is_filtered' column on var and expect the
# validator to report 'True'.
adata.var['feature_is_filtered'] = False
save_and_test(adata, 'True')
# feature_is_filtered is intentionally not dropped afterwards, so that in the
# following cells the only thing that can prevent validation is the
# modification under test

In [None]:
# non-duplicate, non-reserved metadata keys
# Valid case: two distinctly named var columns; the validator is expected to
# report 'True'.
adata.var['feature_is_filtered'] = False
adata.var['feature_is_filtered_1'] = False
save_and_test(adata, 'True')
# Remove only the extra column so adata is back to its baseline state.
adata.var.drop(columns=['feature_is_filtered_1'], inplace=True)

### Test Invalid Cases 

In [None]:
# reserved metadata key citation
# Invalid case: add a 'citation' column to var; the validator is expected to
# report 'False'.
adata.var['citation'] = 'test'
save_and_test(adata, 'False')
# Remove the column again so later cells start from the baseline.
adata.var.drop(columns=['citation'], inplace=True)

In [None]:
# reserved metadata key feature_length
# Invalid case: add a 'feature_length' column to var; the validator is
# expected to report 'False'.
adata.var['feature_length'] = 'test'
save_and_test(adata, 'False')
# Remove the column again so later cells start from the baseline. A single
# in-place drop is enough; deleting the same column a second time would raise
# KeyError because it no longer exists.
adata.var.drop(columns=['feature_length'], inplace=True)

In [None]:
# reserved metadata key schema_reference
# Invalid case: add a 'schema_reference' column to var; the validator is
# expected to report 'False'.
adata.var['schema_reference'] = 'test'
save_and_test(adata, 'False')
# Remove the column again so later cells start from the baseline.
adata.var.drop(columns=['schema_reference'], inplace=True)

In [None]:
# duplicate non-reserved metadata keys
# Invalid case: var ends up with two columns that share the same name; the
# validator is expected to report 'False'.
adata.var['feature_is_filtered'] = False
adata.var['feature_is_filtered_1'] = False
# Renaming the second column onto the first one's name creates the duplicate.
adata.var.rename(columns={'feature_is_filtered_1':'feature_is_filtered'}, inplace = True)
save_and_test(adata, 'False')
adata.var.drop(columns=['feature_is_filtered'], inplace=True) # drop both duplicate feature_is_filtered
adata.var['feature_is_filtered'] = False # replace feature_is_filtered so dataset validates by default

In [None]:
# non-reserved metadata key that starts with '__'
# Invalid case: a var column whose name begins with a double underscore; the
# validator is expected to report 'False'.
adata.var['__feature_is_filtered'] = False
save_and_test(adata, 'False')
# Remove the column again so later cells start from the baseline.
adata.var.drop(columns=['__feature_is_filtered'], inplace=True)

In [None]:
# reserved metadata key that starts with '__'
# Invalid case: a var column named '__schema_reference' (double-underscore
# prefix); the validator is expected to report 'False'.
adata.var['__schema_reference'] = 'test'
save_and_test(adata, 'False')
# Remove the column again so adata is left in its baseline state.
adata.var.drop(columns=['__schema_reference'], inplace=True)