In [None]:
# https://github.com/chanzuckerberg/single-cell-curation/issues/516
# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#feature_length

import numpy as np
import os
import scanpy as sc
import subprocess
import pandas as pd

In [None]:
def _report_feature_fields(var_df, label):
    ''' 
    Input: var-style DataFrame and the label that prefixes the printed messages
    Output: prints which of feature_length / feature_biotype are present in var_df.columns
    
    '''
    has_length = 'feature_length' in var_df.columns
    has_biotype = 'feature_biotype' in var_df.columns

    if not has_length and not has_biotype:
        print(f'{label}: feature_length and feature_biotype are absent - good CELLxGENE should annotate these two var fields')
    elif not has_length:
        # only feature_biotype is present
        print(f'{label}: feature_biotype is present')
        print('-------------------')
    else:
        # Series.info() prints its own summary and returns None,
        # so it is not wrapped in print() (that would emit a stray "None")
        var_df['feature_length'].info()


def save(adata, h5ad_file_name):
    ''' 
    Input: valid adata
    Output: valid h5ad and checks adata.raw.var and adata.var for relevant fields

    The object is written to h5ad_file_name and read back, so the checks run
    against what is actually stored on disk. Nothing is returned.
    
    '''

    adata.write(filename=h5ad_file_name)
    adata = sc.read_h5ad(h5ad_file_name)
    print(adata)

    # accessing adata.raw does not raise when raw is missing, it yields None,
    # so test for None explicitly instead of relying on an exception
    if adata.raw is None:
        print('adata.raw not present')
    else:
        print('adata.raw present')
        # check adata.raw.var
        _report_feature_fields(adata.raw.var, 'Raw.var')

    # check adata.var
    if getattr(adata, 'var', None) is None:
        print('adata.var not present')
    else:
        _report_feature_fields(adata.var, 'Var')

In [None]:
def concat_expected_lengths(directory, list_of_csvs):
    ''' 
    Input: directory holding the csv files and the list of csv file names to read from it
    Output: one DataFrame with the rows of every csv stacked in list order and re-indexed from 0

    The files are read with header=None, so the columns are labelled 0, 1, 2, ...
    
    '''
    # os.path.join adds the separator only when it is missing, so directory
    # works with or without a trailing slash (plain concatenation required one)
    frames = [
        pd.read_csv(os.path.join(directory, csv_name), header=None)
        for csv_name in list_of_csvs
    ]

    return pd.concat(frames, ignore_index=True)

In [None]:
def validate(input_file, output_file):
    ''' 
    Input: h5ad file
    Output: h5ad file with additional var metadata fields

    Runs `cellxgene-schema validate --add-labels <output_file> <input_file>` and
    echoes its stdout and stderr line by line.

    Returns: True / False taken from the last stderr line containing 'is_valid=',
             or None when no such line was printed
    
    '''
    validate_process = subprocess.run(
        ['cellxgene-schema', 'validate', '--add-labels', f'{output_file}', f'{input_file}'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    for line in validate_process.stdout.decode('utf-8').split('\n'):
        print(line)

    is_valid = None
    for line in validate_process.stderr.decode('utf-8').split('\n'):
        print(line)
        if 'is_valid=' in line:
            valid = line.split('=')[-1]
            print(valid)
            # keep the parsed status so the caller can act on it instead of losing it
            is_valid = valid.strip() == 'True'

    return is_valid

In [None]:
# read in gene csvs with known feature lengths
list_of_gene_csvs = ['genes_homo_sapiens.csv.gz', 'genes_mus_musculus.csv.gz', 'genes_sars_cov_2.csv.gz', 'genes_ercc.csv.gz']
# directory with the cellxgene_schema ontology files; defaults to the original checkout location,
# set CELLXGENE_ONTOLOGY_DIR to use a different clone without editing this cell
ontology_dir = os.environ.get('CELLXGENE_ONTOLOGY_DIR', '/Users/corinnsmall/GitClones/CZI/single-cell-curation/cellxgene_schema_cli/cellxgene_schema/ontology_files/')
# os.path.join(path, '') guarantees exactly one trailing separator
expected_df = concat_expected_lengths(os.path.join(ontology_dir, ''), list_of_gene_csvs)

In [None]:
# read in valid h5ad (modify this to include all human, mouse (already included), sars-cov, and spike-in feature ID)
adata = sc.read_h5ad('../valid.h5ad')
# reset_index(inplace=True) moves the current index of each var DataFrame into an ordinary
# column and leaves a default integer index; both DataFrames are modified in place
# NOTE(review): the second call assumes the file has a raw layer (adata.raw is not None) - confirm
adata.var.reset_index(inplace=True)
adata.raw.var.reset_index(inplace=True)

In [None]:
# create new list of feature_ids sampled from concatenated list of genes
# draw one id per row of adata.var (instead of a hard-coded count) so the later
# assignment of sample to the feature_id column lines up row for row
n_features = adata.var.shape[0]
# keep only column 0 (the feature ids) and renumber the rows from 0
sample = expected_df.sample(n_features)[[0]].reset_index(drop=True)

In [None]:
# check how many of each organism was sampled
# each entry pairs the printed label with the feature-id prefix pattern it counts
count_labels_and_patterns = [
    ('Mouse sample count: ', r'^ENSMUSG'),
    ('Human sample count: ', r'^ENSG'),
    ('Covid sample count: ', r'^ENSSASG'),
    ('Spike-in sample count: ', r'^ERCC'),
]
for count_label, id_pattern in count_labels_and_patterns:
    matching_rows = sample.loc[sample[0].str.contains(id_pattern), :]
    print(count_label, matching_rows.shape[0])

In [None]:
# replace adata.var and adata.raw.var with new sampling of feature ids
# sample is a one-column DataFrame; the assignment stores it as the feature_id column
# NOTE(review): pandas aligns this assignment on index labels - presumably both sides carry
# the default 0..n-1 index here and have the same length; verify against the sampling cell
adata.var['feature_id'] = sample
# promote the new feature_id column to the index, in place
adata.var.set_index('feature_id', inplace=True)
# same replacement for the raw layer's var table
adata.raw.var['feature_id'] = sample
adata.raw.var.set_index('feature_id', inplace=True)

In [None]:
# write the re-labelled adata to new_valid.h5ad and print the var / raw.var field report
save(adata=adata, h5ad_file_name='new_valid.h5ad')

In [None]:
# run the validator on new_valid.h5ad; the labelled copy is written to output_new_valid.h5ad
validate(input_file='new_valid.h5ad', output_file='output_new_valid.h5ad')

In [None]:
# check output_new_valid.h5ad for var.feature_length & raw.var.feature_length
adata_with_feature_length = sc.read_h5ad('output_new_valid.h5ad')

# explicit membership checks instead of try / bare except, so unrelated errors are not swallowed
if 'feature_length' not in adata_with_feature_length.var.columns:
    print("'feature_length' is not present in var")

# a missing raw layer also means there is no raw.var feature_length to find
if adata_with_feature_length.raw is None or 'feature_length' not in adata_with_feature_length.raw.var.columns:
    print("'feature_length' is not present in raw.var")

In [None]:
# prep expected_df for merge
# name the positional csv columns, then use the feature id (former column 0) as the index
expected_df = (
    expected_df
    .rename(columns={0: 'feature_id', 1: 'gene_name', 2: 'gene_version', 3: 'gene_length'})
    .set_index('feature_id')
)

In [None]:
# merge vars with the concatenated expected feature lengths df
# left join: every row of the validator output is kept, matched to expected_df by feature_id
expected_merge_kwargs = dict(how='left', left_index=True, right_on='feature_id')
merged_expected_and_calc_var = adata_with_feature_length.var.merge(expected_df, **expected_merge_kwargs)
merged_expected_and_calc_raw_var = adata_with_feature_length.raw.var.merge(expected_df, **expected_merge_kwargs)

In [None]:
# check if validator --add-labels calculated var feature_length correctly
for c,r in merged_expected_and_calc_var.iterrows():
    if r['feature_biotype'] == 'gene':
        # compare feature_length and column 3
        if r['feature_length'] == r['gene_length']:
            print('var feature_length is correct')
            print('\033[1m\033[92mPASSED\033[0m')
        else:
            print('var feature_length is wrong')
            print('\033[1m\033[91mERROR\033[0m')

    elif r['feature_biotype'] != 'gene' and (r['feature_biotype'] == 'spike-in'):
        if r['feature_length'] == 0:
            print('var feature_length is correct')
            print('\033[1m\033[92mPASSED\033[0m')
        else:
            print('var feature_length is wrong')
            print('\033[1m\033[91mERROR\033[0m')

In [None]:
# check if validator --add-labels calculated raw.var feature_length correctly
for c,r in merged_expected_and_calc_raw_var.iterrows():
    if r['feature_biotype'] == 'gene':
        if r['feature_length'] == r['gene_length']:
            print('raw.var feature_length is correct')
            print('\033[1m\033[92mPASSED\033[0m')
        else:
            print('raw.var feature_length is wrong')
            print('\033[1m\033[91mERROR\033[0m')

    elif (r['feature_biotype'] != 'gene') and (r['feature_biotype'] == 'spike-in'):
        if r['feature_length'] == 0:
            print('raw.var feature_length is correct')
            print('\033[1m\033[92mPASSED\033[0m')
        else:
            print('raw.var feature_length is wrong')
            print('\033[1m\033[91mERROR\033[0m')