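Manual tests of the `cellxgene-schema` validation rules for `obsm` embeddings.<br>
Issue: https://github.com/chanzuckerberg/single-cell-curation/issues/590<br>
Schema reference: https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#obsm-embeddings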

In [1]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse

In [2]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and report its verdict.

    Everything the validator writes to stdout is echoed, followed by its
    stderr up to and including the line that carries the status.

    Parameters
    ----------
    file : str
        Path of the h5ad file handed to the command-line validator.

    Returns
    -------
    str or None
        The text following ``is_valid=`` on the first stderr line that
        contains it, with surrounding whitespace removed; ``None`` when no
        stderr line contains ``is_valid=``.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            # Strip stray whitespace (e.g. the '\r' left behind by CRLF line
            # endings) so callers can compare the result against
            # 'True' / 'False' exactly.
            return line.split('=')[-1].strip()
    # No status line was found on stderr.
    return None

In [61]:
def save_and_test(adata, expected):
    """Round-trip ``adata`` through an h5ad file, validate it and print the verdict.

    The object is written to ``test.h5ad`` in the working directory, read
    back, and the keys, shape and dtype of every ``obsm`` entry are printed.
    The file is then passed to ``validate`` and the result compared with
    ``expected``: a green ``PASSED`` is printed when they match, a red
    ``ERROR`` otherwise.

    Parameters
    ----------
    adata
        Object to test; it must provide ``write(filename=...)``.
    expected : str
        Value ``validate`` is expected to return (the cells in this notebook
        pass ``'True'`` or ``'False'``).
    """
    test_file = 'test.h5ad'
    try:
        adata.write(filename=test_file)
        # Re-read so the checks below see the data as it is stored on disk.
        adata = sc.read_h5ad(test_file)

        print("Keys in obsm: {}".format(adata.obsm.keys()))
        for k in adata.obsm.keys():
            print("Dimensions and dtype of {}:\t{}\t{}".format(k, adata.obsm[k].shape, adata.obsm[k].dtype))
        print('------------------')

        valid = validate(test_file)
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Always clean up the scratch file, even when writing, reading or
        # validating raised; skip the removal if the file was never created.
        if os.path.exists(test_file):
            os.remove(test_file)

## Test Valid Cases

In [63]:
# Baseline: the unmodified valid.h5ad (float X_umap, int X_harmony) must still pass validation
adata = sc.read_h5ad("../valid.h5ad")
save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.639563 with status is_valid=True
------------------
[1m[92mPASSED[0m


## Test Invalid Cases

In [29]:
# Invalid case: X_umap is replaced by one of its columns, leaving a 1-D embedding
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'] = adata.obsm['X_umap'][:, 1]
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and shape of X_harmony:	(2000, 2)
Dimensions and shape of X_umap:	(2000,)
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: All embeddings must have as many rows as cells, and at least two columns.'adata.obsm['X_umap']' has shape of '(2000,)'.
Validation complete in 0:00:00.308411 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [62]:
# Invalid case: X_umap values are converted to strings
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'] = adata.obsm['X_umap'].astype(str)
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	object
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.643883 with status is_valid=True
------------------
[1m[91mERROR[0m


In [70]:
# Invalid case: the second column of the first 100 rows of X_umap is set to +inf
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'][:100, 1] = np.inf
print("Contains np.inf")
# Show the first ten rows to confirm the values were injected
print(adata.obsm['X_umap'][:10, :])
save_and_test(adata, expected='False')

Contains np.inf
[[-0.47524262         inf]
 [-3.89065357         inf]
 [10.62553437         inf]
 [ 0.65694539         inf]
 [-2.15557401         inf]
 [ 2.24644435         inf]
 [ 2.49312419         inf]
 [10.7198808          inf]
 [16.89689347         inf]
 [ 0.07814408         inf]]
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.655359 with status is_valid=True
------------------
[1m[91mERROR[0m


In [71]:
# Embedding cannot contain any negative infinity
adata = sc.read_h5ad("../valid.h5ad")
# np.NINF was removed in NumPy 2.0; -np.inf is the same value and works on every NumPy version.
adata.obsm['X_umap'][0:100,1] = -np.inf
print("Contains np.np.NINF")
# Show the first ten rows to confirm the values were injected
print(adata.obsm['X_umap'][0:10,:])
save_and_test(adata, 'False')

Contains np.np.NINF
[[-0.47524262        -inf]
 [-3.89065357        -inf]
 [10.62553437        -inf]
 [ 0.65694539        -inf]
 [-2.15557401        -inf]
 [ 2.24644435        -inf]
 [ 2.49312419        -inf]
 [10.7198808         -inf]
 [16.89689347        -inf]
 [ 0.07814408        -inf]]
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.693057 with status is_valid=True
------------------
[1m[91mERROR[0m


In [None]:
# Invalid case: every value of X_umap is np.nan
adata = sc.read_h5ad("../valid.h5ad")
all_nan = np.full(shape=adata.obsm['X_umap'].shape, fill_value=np.nan)
adata.obsm['X_umap'] = all_nan
# Report how many entries of the replacement array are NaN
print("Number of np.nan values in X_umap of shape {}:\t{}".format(adata.obsm['X_umap'].shape, int(np.isnan(all_nan).sum())))
save_and_test(adata, expected='False')

Number of np.nan values in X_umap of shape (2000, 2):	4000
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
