https://github.com/chanzuckerberg/single-cell-curation/issues/590<br>
https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#obsm-embeddings

In [None]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse

In [None]:
def validate(file):
    """Run ``cellxgene-schema validate`` on *file* and return its verdict.

    Everything the validator writes to stdout and stderr is echoed with
    ``print`` so the complete report shows up in the notebook output.

    Args:
        file: Path of the ``.h5ad`` file to validate.

    Returns:
        The text after the last ``=`` on the first stderr line containing
        ``is_valid=`` (callers in this notebook compare it against the
        strings ``'True'`` / ``'False'``), or ``None`` when the validator
        emitted no such line.
    """
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding='utf-8',
        errors='replace',  # a stray undecodable byte must not abort the test
    )
    for line in validate_process.stdout.splitlines():
        print(line)

    valid = None
    for line in validate_process.stderr.splitlines():
        print(line)
        # Keep echoing after the verdict is found so trailing stderr output
        # is not silently dropped; only the first verdict line is used.
        if valid is None and 'is_valid=' in line:
            valid = line.split('=')[-1].strip()
    return valid

In [None]:
def save_and_test(adata, expected):
    """Round-trip *adata* through disk, validate it and report PASSED/ERROR.

    The object is written to ``test.h5ad``, read back, its obsm keys with
    their shapes and dtypes are printed, and the file is passed to
    ``validate``. The verdict is compared with *expected* and a coloured
    PASSED / ERROR marker is printed.

    Args:
        adata: AnnData object to write and validate.
        expected: Verdict expected from ``validate``; the cells in this
            notebook pass the strings ``'True'`` or ``'False'``.
    """
    path = 'test.h5ad'
    try:
        adata.write(filename=path)
        adata = sc.read_h5ad(path)

        print("Keys in obsm: {}".format(adata.obsm.keys()))
        for k in adata.obsm.keys():
            print("Dimensions and dtype of {}:\t{}\t{}".format(k, adata.obsm[k].shape, adata.obsm[k].dtype))
        print('------------------')

        valid = validate(path)
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Clean up even when writing, reading or validating raises, so a
        # stale file is never left behind for the next test cell.
        if os.path.exists(path):
            os.remove(path)

## Test Valid Cases

In [None]:
# Baseline: make sure valid.h5ad is still valid, where X_umap is float and X_harmony is int.
# The fixture is loaded unmodified and is expected to pass validation.
adata = sc.read_h5ad("../valid.h5ad")
save_and_test(adata, 'True')

In [None]:
# 32-bit dtypes: float32 for X_umap and int32 for X_harmony are expected to validate
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'] = adata.obsm['X_umap'].astype(np.float32)
adata.obsm['X_harmony'] = adata.obsm['X_harmony'].astype(np.int32)
save_and_test(adata, 'True')

In [None]:
# 16-bit dtypes: float16 for X_umap and int16 for X_harmony are expected to validate
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'] = adata.obsm['X_umap'].astype(np.float16)
adata.obsm['X_harmony'] = adata.obsm['X_harmony'].astype(np.int16)
save_and_test(adata, 'True')

In [None]:
# 8-bit dtype: only the integer embedding is converted, since there is no float8
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_harmony'] = adata.obsm['X_harmony'].astype(np.int8)
save_and_test(adata, 'True')

## Test Invalid Cases

In [None]:
# Invalid: embedding has only 1 dimension.
# Indexing a single column with [:,1] drops an axis; presumably X_umap is 2-D
# in the fixture, so the stored value becomes 1-D (TODO confirm).
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'] = adata.obsm['X_umap'][:,1]
save_and_test(adata, 'False')

In [None]:
# Invalid: embedding values converted to strings must be rejected
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'] = adata.obsm['X_umap'].astype(str)
save_and_test(adata, 'False')

In [None]:
# Invalid: positive infinity anywhere in an embedding must be rejected.
# The first 100 rows of column 1 are overwritten in place.
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_umap'][:100, 1] = float('inf')
print("Contains np.inf")
print(adata.obsm['X_umap'][:10])
save_and_test(adata, 'False')

In [None]:
# Invalid: embedding cannot contain any negative infinity.
# The first 100 rows of column 1 are overwritten in place.
adata = sc.read_h5ad("../valid.h5ad")
# Use -np.inf rather than np.NINF: the alias was removed in NumPy 2.0,
# while -np.inf works on every NumPy version.
adata.obsm['X_umap'][0:100,1] = -np.inf
print("Contains -np.inf")
print(adata.obsm['X_umap'][0:10,:])
save_and_test(adata, 'False')

In [None]:
# Invalid: an embedding made up entirely of np.nan values must be rejected.
# The replacement array has the same shape as the original X_umap.
adata = sc.read_h5ad("../valid.h5ad")
all_nan = np.full(adata.obsm['X_umap'].shape, fill_value=np.nan)
adata.obsm['X_umap'] = all_nan
print(f"Number of np.nan values in X_umap of shape {adata.obsm['X_umap'].shape}:\t{np.count_nonzero(np.isnan(all_nan))}")
save_and_test(adata, 'False')

In [None]:
# Invalid: embedding of obsm size zero (zero columns)
adata = sc.read_h5ad("../valid.h5ad")
# Take the row count from the dataset instead of hard-coding 2000, so the
# cell keeps testing the zero-column case if the fixture's cell count changes.
size_zero = np.empty(shape=(adata.n_obs, 0))
adata.obsm['X_umap'] = size_zero
print("Size of X_umap: {}".format(adata.obsm['X_umap'].size))
save_and_test(adata, 'False')

In [None]:
# Invalid: no obsm key starts with X_.
# Each embedding is re-registered under its un-prefixed name and the
# prefixed original is then dropped.
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['umap'] = adata.obsm['X_umap']
del adata.obsm['X_umap']
adata.obsm['harmony'] = adata.obsm['X_harmony']
del adata.obsm['X_harmony']
save_and_test(adata, 'False')

In [None]:
# Invalid: obsm key is 'X_' (the prefix with nothing after it).
# X_umap is left in place; the extra key reuses its array.
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_'] = adata.obsm['X_umap']
save_and_test(adata, 'False')

In [None]:
# Invalid: obsm key is 'X_ ' (the prefix followed by a single space).
# X_umap is left in place; the extra key reuses its array.
adata = sc.read_h5ad("../valid.h5ad")
adata.obsm['X_ '] = adata.obsm['X_umap']
save_and_test(adata, 'False')