https://github.com/chanzuckerberg/single-cell-curation/issues/405

In [1]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse

In [2]:
def validate_add_label(outfile, infile):
    """Run ``cellxgene-schema validate --add-labels`` and report the validation status.

    Everything the command writes to stdout and stderr is echoed so that it
    shows up in the notebook cell output.

    Args:
        outfile: Path passed to ``--add-labels`` (where the labelled file is written).
        infile: Path of the h5ad file to validate.

    Returns:
        The text following ``is_valid=`` on the first stderr line that contains
        it (e.g. ``'True'`` or ``'False'``), with surrounding whitespace removed,
        or ``None`` when no such line was emitted.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', '--add-labels', outfile, infile],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    print(validate_process.stdout.decode('utf-8'))

    stderr_text = validate_process.stderr.decode('utf-8')
    # Echo stderr in full *before* parsing it: returning from inside the echo
    # loop used to drop every line that followed the status line.
    print(stderr_text)

    for line in stderr_text.split('\n'):
        if 'is_valid=' in line:
            # strip() removes a stray '\r' or trailing blanks so the result
            # compares equal to the plain 'True' / 'False' strings callers use.
            return line.split('=')[-1].strip()
    return None

In [3]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and report the validation status.

    Everything the command writes to stdout and stderr is echoed so that it
    shows up in the notebook cell output.

    Args:
        file: Path of the h5ad file to validate.

    Returns:
        The text following ``is_valid=`` on the first stderr line that contains
        it (e.g. ``'True'`` or ``'False'``), with surrounding whitespace removed,
        or ``None`` when no such line was emitted.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    print(validate_process.stdout.decode('utf-8'))

    stderr_text = validate_process.stderr.decode('utf-8')
    # Echo stderr in full *before* parsing it: returning from inside the echo
    # loop used to drop every line that followed the status line.
    print(stderr_text)

    for line in stderr_text.split('\n'):
        if 'is_valid=' in line:
            # strip() removes a stray '\r' or trailing blanks so the result
            # compares equal to the plain 'True' / 'False' strings callers use.
            return line.split('=')[-1].strip()
    return None

In [4]:
def save_and_test(adata, column, expected):
    """Round-trip ``adata`` through disk, validate it, and report PASS/ERROR.

    The object is written to ``test.h5ad`` and read back, the dtype of
    ``column`` is printed, and the file is then validated twice: once with
    ``validate`` and once with ``validate_add_label`` (writing
    ``test_labeled.h5ad``). Both temporary files are removed afterwards.

    Args:
        adata: AnnData object to write and validate.
        column: Name of the ``adata.obs`` column whose dtype is reported.
        expected: Expected validation status as a string (``'True'`` or
            ``'False'``); it is compared with the strings returned by
            ``validate`` and ``validate_add_label``.
    """
    raw_path = 'test.h5ad'
    labeled_path = 'test_labeled.h5ad'

    try:
        adata.write(filename=raw_path)
        adata = sc.read_h5ad(raw_path)

        # Report the dtype of the requested column (this used to be hard-coded
        # to 'test_column' regardless of the `column` argument).
        print("dtype of column of interest ({}): {}".format(column, adata.obs[column].dtype))
        print('------------------')

        valid = validate(raw_path)
        print('------------------')
        print("Adding labels and writing to file")
        valid_label = validate_add_label(labeled_path, raw_path)
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED VALIDATION\033[0m')
            # The add-labels result is only judged once plain validation matched.
            if expected != valid_label:
                print('\033[1m\033[91mERROR\033[0m')
            else:
                print('\033[1m\033[92mPASSED ADD LABELS\033[0m')
    finally:
        # Always clean up, even when writing or validating raised. The labelled
        # file may never have been created, so only remove what exists.
        for path in (raw_path, labeled_path):
            if os.path.exists(path):
                os.remove(path)

## Test Valid Cases

In [5]:
# Valid case: add a plain boolean obs column to valid.h5ad, then run both
# validation and label-adding on the result; both are expected to report 'True'.
adata = sc.read_h5ad("../valid.h5ad")
adata.obs['test_column'] = True
save_and_test(adata=adata, column='test_column', expected='True')

dtype of column of interest (test_column): bool
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.648464 with status is_valid=True
------------------
Adding labels and writing to file
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.651822 with status is_valid=True
------------------
[1m[92mPASSED VALIDATION[0m
[1m[92mPASSED ADD LABELS[0m


## Test Invalid Cases

In [6]:
# The original invalid file is the only file we can obtain with a column of
# category type containing boolean objects.
# The file is on Google Drive; this cell reads it from the current user's
# Downloads folder.
# expanduser() resolves "~" for whoever runs the notebook, instead of
# hard-coding one user's home directory.
adata = sc.read_h5ad(os.path.expanduser("~/Downloads/hdg_humansomatic2.h5ad"))
save_and_test(adata, 'is_doublet', 'True')

TypeError: Can't implicitly convert non-string objects to strings

Above error raised while writing key 'categories' of <class 'h5py._hl.group.Group'> to /