https://github.com/chanzuckerberg/single-cell-curation/issues/590<br>
https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#obsm-embeddings

In [2]:
#import cxg_upload
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse
from datetime import datetime

In [3]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and return its verdict.

    Everything the validator writes to stdout is echoed first, followed by
    stderr line by line. Echoing stops at the first stderr line containing
    ``is_valid=``; the text after the last ``=`` on that line is returned
    (a string such as ``'True'`` or ``'False'``). If no such line is seen,
    the function returns ``None``.
    """
    command = ['cellxgene-schema', 'validate', file]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    stdout_lines = completed.stdout.decode('utf-8').split('\n')
    stderr_lines = completed.stderr.decode('utf-8').split('\n')

    for out_line in stdout_lines:
        print(out_line)

    for err_line in stderr_lines:
        print(err_line)
        if 'is_valid=' not in err_line:
            continue
        # The status line ends with "is_valid=<verdict>".
        return err_line.rsplit('=', 1)[-1]

    return None

In [4]:
def save_and_test(adata, expected):
    """Write ``adata`` to disk, validate the file, and print PASSED or ERROR.

    Steps:
      1. Append the current date/time to ``adata.uns['title']``. The passed
         object is modified in place and must already contain ``uns['title']``.
      2. Write the object to ``test.h5ad`` in the working directory and read
         it back.
      3. Print the keys, shapes and dtypes of the reloaded ``obsm`` entries.
      4. Run ``validate`` on the file and compare its verdict to ``expected``.

    Parameters
    ----------
    adata : AnnData
        Object under test.
    expected : str
        Verdict the validator is expected to report: ``'True'`` or ``'False'``.

    Returns
    -------
    None
        The outcome is only printed (bold green ``PASSED`` / bold red ``ERROR``).

    The temporary file is removed even if writing, reading or validation
    raises, so a failed run does not leave ``test.h5ad`` behind.
    """
    now = datetime.now()
    dt_string = now.strftime('%m/%d %H:%M')
    adata.uns['title'] += (' - ' + dt_string)

    file = 'test.h5ad'
    try:
        adata.write(filename=file)
        # Inspect what was actually serialized, not the in-memory object.
        reloaded = sc.read_h5ad(file)

        print("Keys in obsm: {}".format(reloaded.obsm.keys()))
        for k in reloaded.obsm.keys():
            print("Dimensions and dtype of {}:\t{}\t{}".format(k, reloaded.obsm[k].shape, reloaded.obsm[k].dtype))
        print('------------------')

        valid = validate(file)
        print('------------------')

        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
            #if expected == 'True':
                #cxg_upload.upload(file)
    finally:
        # The write may have failed before the file existed; only remove it if present.
        if os.path.exists(file):
            os.remove(file)

## Test Valid Cases

In [6]:
# Baseline: the unmodified valid.h5ad (float X_umap, int X_harmony) must still validate.
baseline_title = '590 X_umap is float and X_harmony is int'
adata = sc.read_h5ad("../valid.h5ad")
adata.uns['title'] = baseline_title

save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.629539 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [7]:
# 32-bit precision: int32 X_harmony and float32 X_umap are expected to validate.
adata = sc.read_h5ad("../valid.h5ad")
adata.uns['title'] = '590 32bit'

for obsm_key, target_dtype in (('X_harmony', "int32"), ('X_umap', "float32")):
    adata.obsm[obsm_key] = adata.obsm[obsm_key].astype(target_dtype)
save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int32
Dimensions and dtype of X_umap:	(2000, 2)	float32
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.632606 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [8]:
# 16-bit precision: int16 X_harmony and float16 X_umap are expected to validate.
adata = sc.read_h5ad("../valid.h5ad")
adata.uns['title'] = '590 16bit'

dtypes_16bit = {'X_harmony': "int16", 'X_umap': "float16"}
for obsm_key, target_dtype in dtypes_16bit.items():
    adata.obsm[obsm_key] = adata.obsm[obsm_key].astype(target_dtype)
save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int16
Dimensions and dtype of X_umap:	(2000, 2)	float16
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.685250 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [9]:
# 8-bit precision: only the integer embedding is converted (there is no float8 dtype).
adata = sc.read_h5ad("../valid.h5ad")
adata.uns['title'] = '590 8bit'

harmony_int8 = adata.obsm['X_harmony'].astype(np.int8)
adata.obsm['X_harmony'] = harmony_int8
save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int8
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.644642 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [10]:
# Some, but not all, values are np.nan: only the first cell keeps real coordinates.
adata = sc.read_h5ad("../valid.h5ad")
adata.uns['title'] = '590 some, but not all, np.nan values'

n_cells = adata.obsm['X_umap'].shape[0]
mostly_nan = np.full((n_cells, 2), np.nan)
mostly_nan[0] = (4, -54)  # the single non-NaN row
adata.obsm['X_umap'] = mostly_nan

del adata.obsm['X_harmony']
save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_umap)
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.718067 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [23]:
# One obsm key starts with X_ (X_umap) and one does not (harmony); expected to validate.
adata = sc.read_h5ad("../valid.h5ad")
harmony_embedding = adata.obsm['X_harmony']
adata.obsm['harmony'] = harmony_embedding
del adata.obsm['X_harmony']
save_and_test(adata, expected='True')

Keys in obsm: KeysView(AxisArrays with keys: X_umap, harmony)
Dimensions and dtype of X_umap:	(2000, 2)	float64
Dimensions and dtype of harmony:	(2000, 2)	int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:01.640864 with status is_valid=True
------------------
[1m[92mPASSED[0m


## Test Invalid Cases

In [11]:
# Invalid: the embedding is reduced to a single column, i.e. a 1-D array.
adata = sc.read_h5ad("../valid.h5ad")
umap_second_column = adata.obsm['X_umap'][:, 1]
adata.obsm['X_umap'] = umap_second_column
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000,)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: All embeddings must have as many rows as cells, and at least two columns. 'adata.obsm['X_umap']' has shape of '(2000,)'.
Validation complete in 0:00:00.524231 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [12]:
# Invalid: the embedding values are cast to strings instead of a numeric dtype.
adata = sc.read_h5ad("../valid.h5ad")
umap_as_text = adata.obsm['X_umap'].astype(str)
adata.obsm['X_umap'] = umap_as_text
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	object
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: adata.obsm['X_umap'] has an invalid data type. It should be float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits).
Validation complete in 0:00:00.539631 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [13]:
# Invalid: an embedding must not contain positive infinity.
adata = sc.read_h5ad("../valid.h5ad")
umap_coords = adata.obsm['X_umap']
umap_coords[:100, 1] = np.inf  # second coordinate of the first 100 cells
print("Contains np.inf")
print(umap_coords[:10])
save_and_test(adata, expected='False')

Contains np.inf
[[-0.47524262         inf]
 [-3.89065357         inf]
 [10.62553437         inf]
 [ 0.65694539         inf]
 [-2.15557401         inf]
 [ 2.24644435         inf]
 [ 2.49312419         inf]
 [10.7198808          inf]
 [16.89689347         inf]
 [ 0.07814408         inf]]
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: adata.obsm['X_umap'] contains positive infinity or negative infinity values.
Validation complete in 0:00:00.524756 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [14]:
# Invalid: an embedding must not contain negative infinity.
adata = sc.read_h5ad("../valid.h5ad")
# np.NINF was removed in NumPy 2.0; -np.inf is the same value on every NumPy version.
adata.obsm['X_umap'][0:100,1] = -np.inf
# The printed label is kept unchanged so it still matches previously recorded output.
print("Contains np.np.NINF")
print(adata.obsm['X_umap'][0:10,:])
save_and_test(adata, 'False')

Contains np.np.NINF
[[-0.47524262        -inf]
 [-3.89065357        -inf]
 [10.62553437        -inf]
 [ 0.65694539        -inf]
 [-2.15557401        -inf]
 [ 2.24644435        -inf]
 [ 2.49312419        -inf]
 [10.7198808         -inf]
 [16.89689347        -inf]
 [ 0.07814408        -inf]]
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: adata.obsm['X_umap'] contains positive infinity or negative infinity values.
Validation complete in 0:00:00.525169 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [15]:
# Invalid: an embedding in which every value is np.nan.
adata = sc.read_h5ad("../valid.h5ad")
umap_shape = adata.obsm['X_umap'].shape
all_nan = np.empty(umap_shape)
all_nan.fill(np.nan)
adata.obsm['X_umap'] = all_nan
nan_count = np.count_nonzero(np.isnan(all_nan))
print("Number of np.nan values in X_umap of shape {}:\t{}".format(adata.obsm['X_umap'].shape, nan_count))
save_and_test(adata, expected='False')

Number of np.nan values in X_umap of shape (2000, 2):	4000
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: adata.obsm['X_umap'] contains all NaN values.
Validation complete in 0:00:00.531610 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [16]:
# Invalid: an embedding with zero columns, i.e. an array of size 0.
adata = sc.read_h5ad("../valid.h5ad")
# Take the row count from the loaded dataset instead of hard-coding 2000, so the
# array always has one row per cell regardless of the input file.
size_zero = np.empty(shape=(adata.n_obs, 0))
adata.obsm['X_umap'] = size_zero
print("Size of X_umap: {}".format(adata.obsm['X_umap'].size))
save_and_test(adata, 'False')

Size of X_umap: 0
Keys in obsm: KeysView(AxisArrays with keys: X_harmony, X_umap)
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 0)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: The size of the ndarray stored for a 'adata.obsm['X_umap']' MUST NOT be zero.
ERROR: All embeddings must have as many rows as cells, and at least two columns. 'adata.obsm['X_umap']' has shape of '(2000, 0)'.
ERROR: adata.obsm['X_umap'] contains all NaN values.
Validation complete in 0:00:00.509780 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [17]:
# Invalid: every embedding is renamed so that no obsm key starts with X_.
adata = sc.read_h5ad("../valid.h5ad")
for prefixed_key in ('X_umap', 'X_harmony'):
    bare_key = prefixed_key[len('X_'):]  # 'umap', 'harmony'
    adata.obsm[bare_key] = adata.obsm[prefixed_key]
    del adata.obsm[prefixed_key]
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: harmony, umap)
Dimensions and dtype of harmony:	(2000, 2)	int64
Dimensions and dtype of umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: At least one embedding in 'obsm' has to have a key with an 'X_' prefix.
Validation complete in 0:00:00.515444 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [19]:
# Invalid: an obsm key that is exactly 'X_', with no suffix after the prefix.
adata = sc.read_h5ad("../valid.h5ad")
umap_embedding = adata.obsm['X_umap']
adata.obsm['X_'] = umap_embedding
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_, X_harmony, X_umap)
Dimensions and dtype of X_:	(2000, 2)	float64
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Embedding key in 'adata.obsm' X_ must start with X_ and have a suffix at least one character long.
Validation complete in 0:00:00.514275 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [20]:
# Invalid: an obsm key 'X_ ' whose only suffix is a single space.
adata = sc.read_h5ad("../valid.h5ad")
umap_embedding = adata.obsm['X_umap']
adata.obsm['X_ '] = umap_embedding
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_ , X_harmony, X_umap)
Dimensions and dtype of X_ :	(2000, 2)	float64
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Embedding key in 'adata.obsm' X_  must start with X_ and have a suffix at least one character long.
ERROR: Embedding key X_  has whitespace in it, please remove it.
Validation complete in 0:00:00.511037 with status is_valid=False
------------------
[1m[92mPASSED[0m


In [22]:
# Invalid: an obsm key containing whitespace between its characters.
adata = sc.read_h5ad("../valid.h5ad")
umap_embedding = adata.obsm['X_umap']
adata.obsm['X_ U M A P'] = umap_embedding
save_and_test(adata, expected='False')

Keys in obsm: KeysView(AxisArrays with keys: X_ U M A P, X_harmony, X_umap)
Dimensions and dtype of X_ U M A P:	(2000, 2)	float64
Dimensions and dtype of X_harmony:	(2000, 2)	int64
Dimensions and dtype of X_umap:	(2000, 2)	float64
------------------
Loading dependencies
Loading validator modules

Starting validation...
ERROR: Embedding key X_ U M A P has whitespace in it, please remove it.
Validation complete in 0:00:00.515069 with status is_valid=False
------------------
[1m[92mPASSED[0m
