In [42]:
# https://github.com/chanzuckerberg/single-cell-curation/issues/516
# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#feature_length

import numpy as np
import os
import scanpy as sc
import subprocess
import pandas as pd

In [43]:
def _report_feature_fields(var, label):
    '''
    Print whether a var DataFrame carries the CELLxGENE-annotated feature fields.

    Input: var DataFrame (adata.var or adata.raw.var) and the label used as the
           message prefix ('Var' or 'Raw.var')
    Output: None; one of three reports is printed
    '''
    has_length = 'feature_length' in var.columns
    has_biotype = 'feature_biotype' in var.columns

    if has_length:
        # Series.info() prints its own summary and returns None, so it is called
        # directly; wrapping it in print() would add a stray "None" line.
        var['feature_length'].info()
    elif has_biotype:
        print(f'{label}: feature_biotype is present')
        print('-------------------')
    else:
        print(f'{label}: feature_length and feature_biotype are absent - good CELLxGENE should annotate these two var fields')


def save(adata, h5ad_file_name):
    '''
    Write adata to disk, read it back, and report on its feature fields.

    Input: valid adata and the path of the h5ad file to write
    Output: valid h5ad on disk; prints the re-read AnnData summary and checks
            adata.raw.var (when raw exists) and adata.var for the
            'feature_length' / 'feature_biotype' fields. Returns None.
    '''

    adata.write(filename=h5ad_file_name)
    # Inspect what actually landed on disk, not the in-memory object.
    adata = sc.read_h5ad(h5ad_file_name)
    print(adata)

    # AnnData exposes a missing raw layer as None rather than raising, so test
    # for it explicitly instead of relying on an exception.
    if adata.raw is None:
        print('adata.raw not present')
    else:
        print('adata.raw present')
        _report_feature_fields(adata.raw.var, 'Raw.var')

    # An AnnData object always has a var frame, so no absence check is needed.
    _report_feature_fields(adata.var, 'Var')

In [44]:
def concat_expected_lengths(directory, list_of_csvs):
    '''
    Read several header-less CSVs from one directory and stack them row-wise.

    Input: directory containing the CSVs (a trailing path separator is optional)
           and the list of CSV file names inside it
    Output: single DataFrame with the rows of every CSV in list order, a fresh
            0..n-1 index, and integer column labels (files are read with
            header=None)
    Raises: ValueError from pd.concat when list_of_csvs is empty
    '''
    # os.path.join inserts the separator only when it is missing, so both
    # 'some/dir' and 'some/dir/' resolve to the same files.
    frames = [
        pd.read_csv(os.path.join(directory, csv_name), header=None)
        for csv_name in list_of_csvs
    ]

    return pd.concat(frames, ignore_index=True)

In [45]:
def validate(input_file, output_file):
    '''
    Run `cellxgene-schema validate --add-labels` and echo everything it reports.

    Input: h5ad file to validate
    Output: h5ad file with additional var metadata fields, written by the CLI to
            output_file; the function itself returns None. Captured stdout is
            printed first, then captured stderr. After any stderr line that
            contains 'is_valid=', the text following its last '=' is printed on
            its own line.
    '''
    command = ['cellxgene-schema', 'validate', '--add-labels', f'{output_file}', f'{input_file}']
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    stdout_text = completed.stdout.decode('utf-8')
    stderr_text = completed.stderr.decode('utf-8')

    # Printing the whole captured text is the same as printing it line by line.
    print(stdout_text)

    for entry in stderr_text.split('\n'):
        print(entry)
        if 'is_valid=' not in entry:
            continue
        # Surface the validity flag on its own line so it is easy to spot.
        print(entry.rsplit('=', 1)[-1])

In [46]:
# Build the reference table of known feature lengths from the gene CSVs that
# ship with the cellxgene-schema CLI.
list_of_gene_csvs = [
    'genes_homo_sapiens.csv.gz',
    'genes_mus_musculus.csv.gz',
    'genes_sars_cov_2.csv.gz',
    'genes_ercc.csv.gz',
]
expected_df = concat_expected_lengths(
    '~/GitClones/CZI/single-cell-curation/cellxgene_schema_cli/cellxgene_schema/ontology_files/',
    list_of_gene_csvs,
)

# The concatenated frame has positional column labels 0-3; give them names.
expected_df = expected_df.rename(
    columns={0: 'feature_id', 1: 'gene_name', 2: 'gene_version', 3: 'gene_length'}
)

# Override the reference length to 0 for every feature ID starting with 'ERCC'.
is_ercc_feature = expected_df['feature_id'].str.startswith('ERCC')
expected_df.loc[is_ercc_feature, 'gene_length'] = 0

In [47]:
# Read in the valid h5ad fixture (modify this to include all human, mouse
# (already included), sars-cov, and spike-in feature IDs).
adata = sc.read_h5ad('../valid.h5ad')
# Reset both var indexes in place so the existing feature index becomes an
# ordinary column and each frame gets a default integer index. This mutates the
# frames owned by the AnnData object directly.
# NOTE(review): presumably done so that a later cell can assign new feature IDs
# by position - confirm.
adata.var.reset_index(inplace=True)
# NOTE(review): assumes ../valid.h5ad has a raw layer; if adata.raw is None this
# line raises AttributeError - confirm against the fixture.
adata.raw.var.reset_index(inplace=True)

In [49]:
# Create a new list of feature_ids sampled (without replacement) from the
# concatenated list of genes.
# The sample size is tied to the loaded AnnData instead of a hard-coded 22356:
# one feature ID is needed per var row, and any other size would misalign the
# later assignment into adata.var / adata.raw.var.
# A fixed seed makes "Restart & Run All" draw the same feature IDs every time.
SAMPLE_SEED = 0
sample = (
    expected_df[['feature_id']]
    .sample(n=adata.n_vars, random_state=SAMPLE_SEED)
    .reset_index(drop=True)  # fresh 0..n-1 index; the original row labels are not needed
)

In [51]:
# Report how many sampled feature IDs fall under each ID prefix
# (mouse, human, SARS-CoV-2, ERCC spike-in).
sampled_feature_ids = sample['feature_id']
for count_label, id_pattern in (
    ('Mouse sample count: ', r'^ENSMUSG'),
    ('Human sample count: ', r'^ENSG'),
    ('Covid sample count: ', r'^ENSSASG'),
    ('Spike-in sample count: ', r'^ERCC'),
):
    matching_rows = sample.loc[sampled_feature_ids.str.contains(id_pattern), :]
    print(count_label, matching_rows.shape[0])

Mouse sample count:  10749
Human sample count:  11586
Covid sample count:  1
Spike-in sample count:  20


In [52]:
# Replace the feature index of adata.var and adata.raw.var with the new
# sampling of feature IDs.
# `sample` is a one-column ('feature_id') DataFrame with a fresh integer index.
# NOTE(review): the assignment presumably lines up row-for-row because both var
# frames had their indexes reset in an earlier cell - confirm.
# NOTE(review): this cell mutates adata in place and does not look idempotent -
# once 'feature_id' is the index, re-running it would presumably no longer
# align with `sample`; re-run from the cell that loads adata instead.
adata.var['feature_id'] = sample
adata.var.set_index('feature_id', inplace=True)
# Same replacement for the raw layer so var and raw.var carry the same IDs.
adata.raw.var['feature_id'] = sample
adata.raw.var.set_index('feature_id', inplace=True)

In [53]:
# Write the modified adata to new_valid.h5ad and print the field report for the
# file as re-read from disk.
save(adata=adata, h5ad_file_name='new_valid.h5ad')

AnnData object with n_obs × n_vars = 2000 × 22356
    obs: 'BICCN_cluster_id', 'QC', 'BICCN_cluster_label', 'BICCN_subclass_label', 'BICCN_class_label', 'cluster_color', 'size', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'BICCN_project', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'tissue_type'
    var: 'Unnamed: 0', 'feature_is_filtered'
    uns: 'title'
    obsm: 'X_harmony', 'X_umap'
adata.raw present
Raw.var: feature_length and feature_biotype are absent - good CELLxGENE should annotate these two var fields
Var: feature_length and feature_biotype are absent - good CELLxGENE should annotate these two var fields


In [54]:
# Run the cellxgene-schema validator on new_valid.h5ad; the labelled copy is
# written to output_new_valid.h5ad.
validate(input_file='new_valid.h5ad', output_file='output_new_valid.h5ad')

Loading dependencies
Loading validator modules

Starting validation...
Validation complete in 0:00:00.911992 with status is_valid=True
True
Writing labels
enforce canonical format in X
enforce canonical format in raw.X
H5AD label writing complete in 0:03:13.952881, was_writing_successful: True



In [59]:
# Check output_new_valid.h5ad for var.feature_length & raw.var.feature_length.
adata_with_feature_length = sc.read_h5ad('output_new_valid.h5ad')

# Test for the column explicitly instead of using a bare `except:` as a
# membership test, which would also report unrelated errors (or a keyboard
# interrupt) as a missing column.
labelled_raw = adata_with_feature_length.raw
var_frames_to_check = (
    ('var', adata_with_feature_length.var),
    # A missing raw layer is reported as a missing raw.var column.
    ('raw.var', None if labelled_raw is None else labelled_raw.var),
)

for frame_label, var_frame in var_frames_to_check:
    if var_frame is not None and 'feature_length' in var_frame.columns:
        print('\033[1m\033[92mPASSED\033[0m')
    else:
        print('\033[1m\033[91mERROR\033[0m')
        print(f"'feature_length' is not present in {frame_label}")

[1m[92mPASSED[0m
[1m[92mPASSED[0m


In [56]:
# Attach the expected reference columns (including gene_length) to each var
# frame. A left join on the var index (the feature ID) keeps every feature row,
# including any that have no match in expected_df.
merged_expected_and_calc_var = adata_with_feature_length.var.merge(
    expected_df,
    how='left',
    left_index=True,
    right_on='feature_id',
)
merged_expected_and_calc_raw_var = adata_with_feature_length.raw.var.merge(
    expected_df,
    how='left',
    left_index=True,
    right_on='feature_id',
)

In [57]:
# Compare the annotated feature_length against the reference gene_length for
# every row of var. Rows without a reference match (NaN gene_length) fall on
# the mismatch side.
var_feature_length = merged_expected_and_calc_var['feature_length']
var_gene_length = merged_expected_and_calc_var['gene_length']

var_mismatch = len(merged_expected_and_calc_var[var_feature_length != var_gene_length])
var_match = len(merged_expected_and_calc_var[var_feature_length == var_gene_length])

if var_mismatch == 0:
    print('\033[1m\033[92mPASSED\033[0m')
    print(f'All {var_match} features have expected lengths')
else:
    print('\033[1m\033[91mERROR\033[0m')
    print(f'{var_mismatch} features have unexpected lengths')

[1m[92mPASSED[0m
All 22356 features have expected lengths


In [58]:
# Compare the annotated feature_length against the reference gene_length for
# every row of raw.var. Rows without a reference match (NaN gene_length) fall
# on the mismatch side.
raw_var_feature_length = merged_expected_and_calc_raw_var['feature_length']
raw_var_gene_length = merged_expected_and_calc_raw_var['gene_length']

raw_var_mismatch = len(
    merged_expected_and_calc_raw_var[raw_var_feature_length != raw_var_gene_length]
)
raw_var_match = len(
    merged_expected_and_calc_raw_var[raw_var_feature_length == raw_var_gene_length]
)

if raw_var_mismatch == 0:
    print('\033[1m\033[92mPASSED\033[0m')
    print(f'All {raw_var_match} features have expected lengths')
else:
    print('\033[1m\033[91mERROR\033[0m')
    print(f'{raw_var_mismatch} features have unexpected lengths')

[1m[92mPASSED[0m
All 22356 features have expected lengths
