https://github.com/chanzuckerberg/single-cell-curation/issues/382

In [1]:
import numpy as np
import os
import scanpy as sc
import subprocess
import anndata as ad
from scipy import sparse

In [2]:
def validate(file):
    """Run ``cellxgene-schema validate`` on ``file`` and echo everything it prints.

    The command's stdout is printed first, then its stderr line by line.

    Returns:
        The text following the last ``=`` on the first stderr line that
        contains ``is_valid=`` (stderr lines after it are not printed), or
        ``None`` when no stderr line contains that marker.
    """
    completed = subprocess.run(
        ['cellxgene-schema', 'validate', file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout_text = completed.stdout.decode('utf-8')
    stderr_text = completed.stderr.decode('utf-8')

    # Echo stdout one line per row, exactly as a per-line print loop would.
    print(*stdout_text.split('\n'), sep='\n')

    for entry in stderr_text.split('\n'):
        print(entry)
        if 'is_valid=' not in entry:
            continue
        # The status is whatever trails the final '=' on this line.
        return entry.rsplit('=', 1)[-1]
    return None

In [3]:
def last_values(x):
    """Return a trailing slice of up to 10 values from ``x`` for display.

    For a scipy sparse input the slice is taken from the end of ``x.data``.
    Otherwise ``x`` is indexed as a 2-D array and the slice is taken from the
    end of its last row.
    """
    if not sparse.issparse(x):
        n_rows, n_cols = x.shape[0], x.shape[1]
        final_row = x[n_rows - 1]
        return final_row[n_cols - 10:n_cols]

    stored = x.data
    count = len(stored)
    return stored[count - 10:count]

In [4]:
def get_format(x):
    """Describe the container type of an expression matrix.

    Returns:
        The sparse format name (e.g. ``"csr"``, ``"csc"``) for a scipy sparse
        input, the string ``"numpy array"`` for a ``numpy.ndarray``, and
        ``None`` for anything else.
    """
    if sparse.issparse(x):
        # Read the `format` attribute rather than calling getformat(): the
        # attribute is available on both sparse matrices and sparse arrays.
        return x.format
    if isinstance(x, np.ndarray):
        return "numpy array"
    return None

In [5]:
def save_and_test(adata, expected):
    """Summarise the raw matrix of ``adata``, validate it on disk, and report the outcome.

    The matrix inspected is ``adata.raw.X`` when ``adata.raw`` is set and
    ``adata.X`` otherwise. The object is written to ``test.h5ad`` in the
    current working directory, passed to ``validate``, and the temporary file
    is removed again even if writing or validation raises.

    Args:
        adata: AnnData object to write and validate.
        expected: Value compared against the status returned by ``validate``
            (the callers in this notebook pass the string ``'False'``).

    Prints a red ``ERROR`` when the status differs from ``expected`` and a
    green ``PASSED`` when it matches.
    """
    has_raw = bool(adata.raw)
    matrix = adata.raw.X if has_raw else adata.X

    print("adata.raw present" if has_raw else "No adata.raw present")
    print("Raw matrix data structure: {}".format(get_format(matrix)))
    print("Raw matrix max value:{}".format(matrix.max()))
    print("Last 10 raw values: {}".format(last_values(matrix)))
    print('------------------')

    filename = 'test.h5ad'
    try:
        adata.write(filename=filename)
        valid = validate(filename)
    finally:
        # Never leave the scratch file behind; the existence check keeps a
        # failed write from being masked by a FileNotFoundError here.
        if os.path.exists(filename):
            os.remove(filename)

    print('------------------')
    if expected != valid:
        print('\033[1m\033[91mERROR\033[0m')
    else:
        print('\033[1m\033[92mPASSED\033[0m')

## Add float values beyond the 10,000th cell (adata.raw.X as CSR, CSC, and dense matrix): should not pass validation

In [6]:
# Build a larger object by stacking six copies of the valid fixture
adata = sc.read_h5ad('../valid.h5ad')
adata = ad.concat([adata] * 6, uns_merge='first')
adata.obs_names_make_unique()
adata.var['feature_is_filtered'] = False

# Shift the final 10 stored values of the csr matrix in adata.raw.X by 0.3
# so they are no longer whole numbers
raw_values = adata.raw.X.data
stop = len(raw_values)
start = stop - 10
raw_values[start:stop] = raw_values[start:stop] + 0.3
save_and_test(adata, 'False')

  utils.warn_names_duplicates("obs")


adata.raw present
Raw matrix data structure: csr
Raw matrix max value:157728.0
Last 10 raw values: [ 48.3  93.3  31.3 230.3 866.3 286.3 205.3 185.3  94.3  73.3]
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:03.606474 with status is_valid=True
------------------
[1m[91mERROR[0m


In [7]:
# Create a larger object by concatenating six copies of the valid fixture
adata = sc.read_h5ad('../valid.h5ad')
adata = ad.concat([adata] * 6, uns_merge='first')
adata.obs_names_make_unique()
adata.var['feature_is_filtered'] = False

# Modify last 10 expression values into float for csc matrix in adata.raw.X
raw = ad.AnnData(X=sparse.csc_matrix(adata.raw.X), obs=adata.obs, var=adata.raw.var)
# Take the slice bounds from the CSC matrix that is actually edited, not from
# the CSR matrix it was converted from.
# NOTE(review): CSC stores values column by column, so the trailing entries of
# .data belong to the last column(s) and are not necessarily in the last rows
# (cells) - confirm this still exercises the intended ">10000th cell" case.
n_stored = len(raw.X.data)
raw.X.data[n_stored-10:n_stored] = raw.X.data[n_stored-10:n_stored] + 0.3
adata.raw = raw
save_and_test(adata, 'False')

  utils.warn_names_duplicates("obs")


adata.raw present
Raw matrix data structure: csc
Raw matrix max value:157728.0
Last 10 raw values: [  3.3  10.3   1.3  37.3   2.3  94.3  30.3  43.3  11.3 123.3]
------------------
Loading dependencies
Loading validator modules

Starting validation...
Matrices are in CSC format; loading entire dataset into memory.
Validation complete in 0:00:03.075446 with status is_valid=True
------------------
[1m[91mERROR[0m


In [8]:
# Create a larger object by concatenating six copies of the valid fixture
adata = sc.read_h5ad('../valid.h5ad')
adata = ad.concat([adata] * 6, uns_merge='first')
adata.obs_names_make_unique()
adata.var['feature_is_filtered'] = False

# Modify last 10 expression values into float for dense matrix in adata.raw.X.
# toarray() yields a plain ndarray; todense() would return np.matrix, on which
# row-then-column chained indexing does not select a 1-D row.
raw = ad.AnnData(X=adata.raw.X.toarray(), obs=adata.obs, var=adata.raw.var)
n_obs, n_vars = raw.X.shape
# Last row, last 10 columns (zeros in that span become 0.3 as well)
raw.X[n_obs-1, n_vars-10:n_vars] = raw.X[n_obs-1, n_vars-10:n_vars] + 0.3
adata.raw = raw
save_and_test(adata, 'False')

  utils.warn_names_duplicates("obs")


adata.raw present
Raw matrix data structure: numpy array
Raw matrix max value:157728.0
Last 10 raw values: [  0.3   0.3   0.3 185.3   0.3   0.3  94.3  73.3   0.3   0.3]
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:04.759766 with status is_valid=True
------------------
[1m[91mERROR[0m


## Add float values beyond the 10,000th cell (adata.X as CSR, CSC, and dense matrix): should not pass validation

In [9]:
# Build a larger object by stacking six copies of the valid fixture
adata = sc.read_h5ad('../valid.h5ad')
adata = ad.concat([adata] * 6, uns_merge='first')
adata.obs_names_make_unique()
adata.var['feature_is_filtered'] = False

# Shift the final 10 stored values of the csr matrix by 0.3, then move that
# matrix from adata.raw.X into adata.X and drop adata.raw
raw_matrix = adata.raw.X
stop = len(raw_matrix.data)
start = stop - 10
raw_matrix.data[start:stop] = raw_matrix.data[start:stop] + 0.3
adata.X = raw_matrix
del adata.raw
save_and_test(adata, 'False')

  utils.warn_names_duplicates("obs")


No adata.raw present
Raw matrix data structure: csr
Raw matrix max value:157728.0
Last 10 raw values: [ 48.3  93.3  31.3 230.3 866.3 286.3 205.3 185.3  94.3  73.3]
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:02.974905 with status is_valid=True
------------------
[1m[91mERROR[0m


In [10]:
# Create a larger object by concatenating six copies of the valid fixture
adata = sc.read_h5ad('../valid.h5ad')
adata = ad.concat([adata] * 6, uns_merge='first')
adata.obs_names_make_unique()
adata.var['feature_is_filtered'] = False

# Modify last 10 expression values into float for csc matrix in adata.X
raw = ad.AnnData(X=sparse.csc_matrix(adata.raw.X), obs=adata.obs, var=adata.raw.var)
# Take the slice bounds from the CSC matrix that is actually edited, not from
# the CSR matrix it was converted from.
# NOTE(review): CSC stores values column by column, so the trailing entries of
# .data belong to the last column(s) and are not necessarily in the last rows
# (cells) - confirm this still exercises the intended ">10000th cell" case.
n_stored = len(raw.X.data)
raw.X.data[n_stored-10:n_stored] = raw.X.data[n_stored-10:n_stored] + 0.3
adata.X = raw.X
del adata.raw
save_and_test(adata, 'False')

  utils.warn_names_duplicates("obs")


No adata.raw present
Raw matrix data structure: csc
Raw matrix max value:157728.0
Last 10 raw values: [  3.3  10.3   1.3  37.3   2.3  94.3  30.3  43.3  11.3 123.3]
------------------
Loading dependencies
Loading validator modules

Starting validation...
Matrices are in CSC format; loading entire dataset into memory.
Validation complete in 0:00:02.510994 with status is_valid=True
------------------
[1m[91mERROR[0m


In [11]:
# Create a larger object by concatenating six copies of the valid fixture
adata = sc.read_h5ad('../valid.h5ad')
adata = ad.concat([adata] * 6, uns_merge='first')
adata.obs_names_make_unique()
adata.var['feature_is_filtered'] = False

# Modify last 10 expression values into float for dense matrix in adata.X.
# toarray() yields a plain ndarray; todense() would return np.matrix, on which
# row-then-column chained indexing does not select a 1-D row.
raw = ad.AnnData(X=adata.raw.X.toarray(), obs=adata.obs, var=adata.raw.var)
n_obs, n_vars = raw.X.shape
# Last row, last 10 columns (zeros in that span become 0.3 as well)
raw.X[n_obs-1, n_vars-10:n_vars] = raw.X[n_obs-1, n_vars-10:n_vars] + 0.3
adata.X = raw.X
del adata.raw
save_and_test(adata, 'False')

  utils.warn_names_duplicates("obs")


No adata.raw present
Raw matrix data structure: numpy array
Raw matrix max value:157728.0
Last 10 raw values: [  0.3   0.3   0.3 185.3   0.3   0.3  94.3  73.3   0.3   0.3]
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:04.058138 with status is_valid=True
------------------
[1m[91mERROR[0m
