In [1]:
# https://github.com/chanzuckerberg/single-cell-curation/issues/516
# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#feature_length

import numpy as np
import os
import scanpy as sc
import subprocess

In [2]:
def validate(file):
    validate_process = subprocess.run(['cellxgene-schema', 'validate', file], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            valid = line.split('=')[-1]
            return valid

In [66]:
def save_and_test(adata, expected):
    if ('feature_length' in adata.obs.columns) and ('feature_biotype' in adata.obs.columns):
        print(adata.obs[['feature_biotype','feature_length']].info())
        print('-------------------')
        print(adata.obs[['feature_length','feature_biotype']].value_counts())
        
    elif 'feature_biotype' in adata.obs.columns: 
        print('feature_length is absent')
    
    else: 
        print('feature_biotype is absent')

    print('------------------')

    adata.write(filename='test.h5ad')
    valid = validate('test.h5ad')
    print('------------------')
    if expected != valid:
        print('\033[1m\033[91mERROR\033[0m')
    else:
        print('\033[1m\033[92mPASSED\033[0m')
    os.remove('test.h5ad')

In [4]:
adata = sc.read_h5ad('../valid.h5ad') #backed='r' would be slightly quicker but produces an error with multiple writes

**Test valid cases**

In [35]:
adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = 1000
save_and_test(adata, 'True')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
1000            gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.649407 with status is_valid=True
------------------
[1m[92mPASSED[0m


In [82]:
adata.obs['feature_biotype'] = 'spike-in'
adata.obs['feature_length'] = 0
save_and_test(adata, 'True')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
0               spike-in           2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.651693 with status is_valid=True
------------------
[1m[92mPASSED[0m


**Test invalid cases**

In [77]:
# feature_biotype = gene
# feature_length = absent column

adata.obs['feature_biotype'] = 'gene'
adata.obs.drop(columns=['feature_length'], inplace=True)
save_and_test(adata, 'False')

feature_length is absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.686513 with status is_valid=True
------------------
[1m[91mERROR[0m


In [79]:
# feature_biotype = spike-in
# feature_length = absent column

adata.obs['feature_biotype'] = 'spike-in'
adata.obs.drop(columns=['feature_length'], inplace=True)
save_and_test(adata, 'False')

feature_length is absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.664002 with status is_valid=True
------------------
[1m[91mERROR[0m


In [81]:
# feature_biotype = absent column
# feature_length = gene value

adata.obs.drop(columns='feature_biotype', inplace=True)
adata.obs['feature_length'] = 1000
save_and_test(adata, 'False')

feature_biotype is absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.687375 with status is_valid=True
------------------
[1m[91mERROR[0m


In [83]:
# feature_biotype = absent column
# feature_length = spike-in value

adata.obs.drop(columns='feature_biotype', inplace=True)
adata.obs['feature_length'] = 0
save_and_test(adata, 'False')

feature_biotype is absent
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.650419 with status is_valid=True
------------------
[1m[91mERROR[0m


In [84]:
# feature_biotype = gene
# feature_length = null values

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = np.NaN
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature_biotype  2000 non-null   object 
 1   feature_length   0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
Series([], dtype: int64)
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.689997 with status is_valid=True
------------------
[1m[91mERROR[0m


In [85]:
# feature_biotype = spike-in
# feature_length = null values

adata.obs['feature_biotype'] = 'spike-in'
adata.obs['feature_length'] = np.NaN
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature_biotype  2000 non-null   object 
 1   feature_length   0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
Series([], dtype: int64)
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.650022 with status is_valid=True
------------------
[1m[91mERROR[0m


In [43]:
# feature_biotype = gene
# feature_length = 0 (zero int value)

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = 0
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
0               gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.642831 with status is_valid=True
------------------
[1m[91mERROR[0m


In [94]:
# feature_biotype = spike-in
# feature_length = 1000 (non-zero int value)

adata.obs['feature_biotype'] = 'spike-in'
adata.obs['feature_length'] = 1000
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
1000            spike-in           2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.671492 with status is_valid=True
------------------
[1m[91mERROR[0m


In [86]:
# feature_biotype = gene
# feature_length = too small of a length?
# not sure if needed?

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = 1
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
1               gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.645326 with status is_valid=True
------------------
[1m[91mERROR[0m


In [87]:
# feature_biotype = gene
# feature_length = negative value

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = -1
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
-1              gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.642309 with status is_valid=True
------------------
[1m[91mERROR[0m


In [88]:
# feature_biotype = spike-in
# feature_length = negative value

adata.obs['feature_biotype'] = 'spike-in'
adata.obs['feature_length'] = -1
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
-1              spike-in           2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.667825 with status is_valid=True
------------------
[1m[91mERROR[0m


In [41]:
# feature_biotype = gene
# feature_length = float

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = 1.4
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature_biotype  2000 non-null   object 
 1   feature_length   2000 non-null   float64
dtypes: float64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
1.4             gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.644813 with status is_valid=True
------------------
[1m[91mERROR[0m


In [93]:
# feature_biotype = spike-in
# feature_length = float

adata.obs['feature_biotype'] = 'spike-in'
adata.obs['feature_length'] = 1.4
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature_biotype  2000 non-null   object 
 1   feature_length   2000 non-null   float64
dtypes: float64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
1.4             spike-in           2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.653256 with status is_valid=True
------------------
[1m[91mERROR[0m


In [92]:
# feature_biotype = gene
# feature_length = negative float
# maybe unnecessary

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = -1.4
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature_biotype  2000 non-null   object 
 1   feature_length   2000 non-null   float64
dtypes: float64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
-1.4            gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.654662 with status is_valid=True
------------------
[1m[91mERROR[0m


In [91]:
# feature_biotype = spike-in
# feature_length = negative float
# maybe unnecessary

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = -1.4
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   feature_biotype  2000 non-null   object 
 1   feature_length   2000 non-null   float64
dtypes: float64(1), object(1)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
-1.4            gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.653314 with status is_valid=True
------------------
[1m[91mERROR[0m


In [90]:
# feature_biotype = gene
# feature_length = str

adata.obs['feature_biotype'] = 'gene'
adata.obs['feature_length'] = '1000'
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   object
dtypes: object(2)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
1000            gene               2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.652828 with status is_valid=True
------------------
[1m[91mERROR[0m


In [89]:
# feature_biotype = spike-in
# feature_length = str

adata.obs['feature_biotype'] = 'spike-in'
adata.obs['feature_length'] = '0'
save_and_test(adata, 'False')

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, SM-D9D8O_S03_E1-50 to SM-D9E5W_S12_E1-50
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   feature_biotype  2000 non-null   object
 1   feature_length   2000 non-null   object
dtypes: object(2)
memory usage: 111.4+ KB
None
-------------------
feature_length  feature_biotype
0               spike-in           2000
dtype: int64
------------------
Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.651013 with status is_valid=True
------------------
[1m[91mERROR[0m
