In [None]:
# https://github.com/chanzuckerberg/single-cell-curation/issues/516
# https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md#feature_length

import os
import subprocess
import tempfile

import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
def _run_cellxgene_validate(file, labeled_file):
    '''Run `cellxgene-schema validate --add-labels`, echo its output, and return the is_valid value.'''
    # --add-labels takes the OUTPUT path as its value; the input file is the positional argument.
    command = ['cellxgene-schema', 'validate', '--add-labels', labeled_file, file]
    validate_process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    for line in validate_process.stdout.decode('utf-8').splitlines():
        print(line)

    valid = None
    for line in validate_process.stderr.decode('utf-8').splitlines():
        print(line)
        if 'is_valid=' in line:
            # strip() so stray whitespace / carriage returns cannot break the
            # string comparison against 'True' / 'False' done by callers.
            valid = line.split('=')[-1].strip()
    return valid


def validate(file, labeled_file=None):
    '''
    Validate an .h5ad file with cellxgene-schema, asking it to also write a labeled copy.

    Input: 1) path of the .h5ad file to validate;
           2) optional path where the labeled .h5ad should be written. When omitted, the
              labeled copy is written to a temporary directory and discarded afterwards.
    Output: the text after `is_valid=` in the validator's stderr (e.g. 'True' or 'False'),
            or None when no such line was reported (for instance if the command failed).

    All stdout and stderr lines from the validator are printed.
    '''
    if labeled_file is not None:
        return _run_cellxgene_validate(file, labeled_file)

    with tempfile.TemporaryDirectory() as scratch_dir:
        return _run_cellxgene_validate(file, os.path.join(scratch_dir, 'labeled.h5ad'))

In [None]:
def save_and_test(adata, expected):
    '''
    Round-trip an AnnData object through 'test.h5ad', validate the file, and report the outcome.

    Input: 1) AnnData object to write and validate;
           2) expected validation result as a string ('True' or 'False'), compared against
              the value returned by validate().
    Output: None. Prints a summary of var.feature_biotype / var.feature_length (raw.var is
            not summarised), the validator output, and PASSED or ERROR.

    'test.h5ad' is removed afterwards, even when writing or validation raises.
    '''
    test_file = 'test.h5ad'
    try:
        adata.write(filename=test_file)
        reloaded = sc.read_h5ad(test_file)
        var_columns = reloaded.var.columns
        if ('feature_length' in var_columns) and ('feature_biotype' in var_columns):
            # DataFrame.info() prints by itself and returns None; wrapping it in print()
            # would emit an extra 'None' line.
            reloaded.var[['feature_biotype','feature_length']].info()
            print('-------------------')
            print(reloaded.var[['feature_length','feature_biotype']].value_counts())

        elif 'feature_biotype' in var_columns:
            print('feature_length is absent')

        else:
            print('feature_biotype is absent')

        print('------------------')
        valid = validate(test_file)
        print('------------------')
        if expected != valid:
            print('\033[1m\033[91mERROR\033[0m')
        else:
            print('\033[1m\033[92mPASSED\033[0m')
    finally:
        # Clean up the scratch file on every path so a failed run cannot leak into the next one.
        if os.path.exists(test_file):
            os.remove(test_file)

In [None]:
# Load the fixture (presumably a schema-valid dataset, going by its file name) into memory.
# The test cells below mutate this shared object.
adata = sc.read_h5ad('../valid.h5ad') # backed='r' would be slightly quicker but produces an error with multiple writes

In [None]:
# Display the var annotations of the loaded fixture.
adata.var

In [None]:
# Display the raw.var annotations of the loaded fixture.
adata.raw.var

### Test info
- `feature_length` will be a new `var` field that the portal adds to each dataset upon submission (Annotator = CELLxGENE Discover)

- the --add-labels option for cellxgene-schema should be adding this too. 

- validation will consist of running `cellxgene-schema validate --add-labels` on a .h5ad, then opening the resulting labeled .h5ad and doing some checks:

    1. make sure var.feature_length is present

    2. if there’s a raw layer, make sure raw.var.feature_length is present

    3. then do some checks to make sure the lengths filled in are expected (we may need to pull some genes from v38 GENCODE gtf to use as checks here)

        - Lengths depend on feature_biotype = 'gene' or 'spike-in'
        - Gene lengths are calculated by creating non-overlapping concatenated exons across all isoforms of the gene, and then adding up their length in base-pairs.
        - Testing both var and raw.var

        <br> Valid cases:
        1. gene = summed length of non-overlapping concatenated exons across all isoforms of the gene
           
        2. spike-in = 0

        <br> Invalid cases:
        
        1. when feature_biotype = `gene` -> is gene length calculated as expected?
            - test when there are overlapping exons of various lengths (isoforms):
                - using gene found in v38 gencode gtf with multiple exons that vary in length
            
            - does it matter what strands exons are found on?
            
            - test null values?

            - test datatypes: str, float
            - any investigation into the use of cds vs exons?

        2. when feature_biotype = `spike-in` -> is length calculated as 0?
            - test non-zero value
            - test null value
            - test datatypes: str, float
            


In [None]:
def _parse_gtf_attributes(attribute_field):
    '''Turn a GTF attribute string (`key "value"; key value; ...`) into a dict; first occurrence of a key wins.'''
    attributes = {}
    for part in attribute_field.split(';'):
        part = part.strip()
        if not part:
            continue
        key, _, value = part.partition(' ')
        attributes.setdefault(key, value.strip().strip('"'))
    return attributes


def create_exon_df(ref_gtf):
    '''
    Input: 1) path to reference gtf annotation file
    Output: pandas dataframe of extracted exon elements, one row per exon line, with columns
            ensg_id (gene_id without its version suffix), gene_name, feature, exon_id,
            exon_number, gene_type, start, stop, strand and column8 (the raw attribute string)

    Attributes are looked up by key rather than by position, so the result does not depend on
    the attribute order of a particular GTF release. An attribute that is missing from a line
    is reported as None. exon_number holds the attribute value itself (e.g. '1').
    '''
    columns = ['ensg_id', 'gene_name', 'feature', 'exon_id', 'exon_number',
               'gene_type', 'start', 'stop', 'strand', 'column8']

    # comment='#' skips the header lines whatever their number, instead of assuming six rows.
    ref = pd.read_table(ref_gtf, comment='#', header=None)
    exons = ref[ref[2] == 'exon']

    records = []
    for feature, start, stop, strand, attribute_field in zip(exons[2], exons[3], exons[4], exons[6], exons[8]):
        attributes = _parse_gtf_attributes(attribute_field)
        gene_id = attributes.get('gene_id')
        records.append({
            'ensg_id': gene_id.split('.')[0] if gene_id is not None else None,
            'gene_name': attributes.get('gene_name'),
            'feature': feature,
            'exon_id': attributes.get('exon_id'),
            'exon_number': attributes.get('exon_number'),
            'gene_type': attributes.get('gene_type'),
            'start': start,
            'stop': stop,
            'strand': strand,
            'column8': attribute_field,
        })

    return pd.DataFrame(records, columns=columns)
    

In [None]:
# Location of the reference GTF. Set the REF_GTF environment variable to point at your own copy;
# the fallback is the path originally hard-coded in this notebook.
# NOTE(review): the fallback file name (vM1) and the ENSMUSG ids queried further down suggest a
# mouse annotation, while the test plan mentions the v38 GENCODE gtf -- confirm which is intended.
ref_gtf_path = os.environ.get('REF_GTF', '/Users/corinnsmall/Downloads/gencode.vM1.annotation.gtf')
exon_df = create_exon_df(ref_gtf_path)
exon_df

In [None]:
# Count how many extracted rows carry each exon_id, to spot exon ids that appear more than once.
exon_df['exon_id'].value_counts()

In [None]:
# Show every extracted row recorded for one specific exon_id.
exon_df[exon_df['exon_id'] == 'ENSMUSE00000140031.1']

In [None]:
# Print the raw attribute column of every exon row belonging to one gene.
gene_exon_rows = exon_df[exon_df['ensg_id'] == 'ENSMUSG00000029577']
for attribute_field in gene_exon_rows['column8']:
    print(attribute_field)

In [None]:
def calculate_expected_feature_length(ensg_id, exon_df):
    '''
    Input: 1) known ensg_id; 2) exon df of known genes (needs 'ensg_id', 'start' and 'stop' columns)
    Output: expected gene length in bps, as an int
    Raises: ValueError when exon_df holds no exon for ensg_id

    Method: Gene lengths are calculated by creating non-overlapping concatenated exons across all isoforms of the gene, and then adding up their length in base-pairs.

    Start/stop are treated as 1-based inclusive coordinates, so one exon contributes
    stop - start + 1 bases. Coordinates are compared directly.
    NOTE(review): this assumes all exons of one ensg_id lie on the same sequence -- confirm.
    '''
    gene_exons = exon_df.loc[exon_df['ensg_id'] == ensg_id, ['start', 'stop']]
    if gene_exons.empty:
        raise ValueError(f'no exons found for {ensg_id!r}')

    intervals = sorted(zip(gene_exons['start'].astype(int).tolist(),
                           gene_exons['stop'].astype(int).tolist()))

    total_length = 0
    merged_start, merged_stop = intervals[0]
    for start, stop in intervals[1:]:
        if start > merged_stop:
            # Gap before this exon: the merged block is complete, count it and start a new one.
            total_length += merged_stop - merged_start + 1
            merged_start, merged_stop = start, stop
        else:
            # Overlapping (or nested) exon: extend the current block if it reaches further.
            merged_stop = max(merged_stop, stop)
    total_length += merged_stop - merged_start + 1

    return total_length

    

**Test valid cases**

In [None]:
# Valid case: var is marked as 'gene' and var.feature_length holds the expected merged-exon length.
# TODO: still need to determine which gene (ensg_id) to use.
var_table = adata.var
var_table['feature_biotype'] = 'gene'
expected_length = calculate_expected_feature_length(ensg_id, exon_df)
var_table['feature_length'] = expected_length
save_and_test(adata, expected='True')

In [None]:
# Valid case: raw.var is marked as 'gene' and raw.var.feature_length holds the expected merged-exon length.
# TODO: still need to determine which gene (ensg_id) to use.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
expected_length = calculate_expected_feature_length(ensg_id, exon_df)
raw_var_table['feature_length'] = expected_length
save_and_test(adata, expected='True')

In [None]:
# Valid case: var is marked as 'spike-in' and var.feature_length is 0.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = 0
save_and_test(adata, expected='True')

In [None]:
# Valid case: raw.var is marked as 'spike-in' and raw.var.feature_length is 0.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = 0
save_and_test(adata, expected='True')

**Test invalid cases for 'feature_biotype'='gene'**

In [None]:
# Invalid case: var.feature_length is not present, for a known gene.
adata.var['feature_biotype'] = 'gene'
# DataFrame.drop returns a new frame by default, so the column has to be dropped in place for
# the removal to reach adata. errors='ignore' keeps the cell runnable when the column is already absent.
adata.var.drop(columns=['feature_length'], errors='ignore', inplace=True)
save_and_test(adata, 'False')

In [None]:
# Invalid case: raw.var.feature_length is not present, for a known gene.
adata.raw.var['feature_biotype'] = 'gene'
# DataFrame.drop returns a new frame by default, so the column has to be dropped in place for
# the removal to reach adata.raw. errors='ignore' keeps the cell runnable when the column is already absent.
adata.raw.var.drop(columns=['feature_length'], errors='ignore', inplace=True)
save_and_test(adata, 'False')

In [None]:
# Invalid case: var.feature_length exists but is null, for a gene.
var_table = adata.var
var_table['feature_biotype'] = 'gene'
var_table['feature_length'] = None
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length exists but is null, for a gene.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
raw_var_table['feature_length'] = None
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is a bool, for a gene.
var_table = adata.var
var_table['feature_biotype'] = 'gene'
var_table['feature_length'] = False
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is a bool, for a gene.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
raw_var_table['feature_length'] = False
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is an empty string, for a gene.
var_table = adata.var
var_table['feature_biotype'] = 'gene'
var_table['feature_length'] = ''
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is an empty string, for a gene.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
raw_var_table['feature_length'] = ''
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is the string 'NaN', for a gene.
var_table = adata.var
var_table['feature_biotype'] = 'gene'
var_table['feature_length'] = 'NaN'
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is the string 'NaN', for a gene.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
raw_var_table['feature_length'] = 'NaN'
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length holds an unexpected length (0), for a gene.
var_table = adata.var
var_table['feature_biotype'] = 'gene'
var_table['feature_length'] = 0
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length holds an unexpected length (0), for a gene.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
raw_var_table['feature_length'] = 0
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is the negated expected length, for a gene
# (unexpected value for uint dtype).
var_table = adata.var
var_table['feature_biotype'] = 'gene'
negated_length = np.negative(calculate_expected_feature_length(ensg_id, exon_df))
var_table['feature_length'] = negated_length
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is the negated expected length, for a gene
# (unexpected value for uint dtype).
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
negated_length = np.negative(calculate_expected_feature_length(ensg_id, exon_df))
raw_var_table['feature_length'] = negated_length
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is the expected length cast to float, for a gene
# (unexpected value for uint dtype).
var_table = adata.var
var_table['feature_biotype'] = 'gene'
float_length = float(calculate_expected_feature_length(ensg_id, exon_df))
var_table['feature_length'] = float_length
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is the expected length cast to float, for a gene
# (unexpected value for uint dtype).
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'gene'
float_length = float(calculate_expected_feature_length(ensg_id, exon_df))
raw_var_table['feature_length'] = float_length
save_and_test(adata, expected='False')

**Test invalid cases for 'feature_biotype'='spike-in'**

In [None]:
# Invalid case: var.feature_length is not present, for spike-in.
adata.var['feature_biotype'] = 'spike-in'
# DataFrame.drop returns a new frame by default, so the column has to be dropped in place for
# the removal to reach adata. errors='ignore' keeps the cell runnable when the column is already absent.
adata.var.drop(columns=['feature_length'], errors='ignore', inplace=True)
save_and_test(adata, 'False')

In [None]:
# Invalid case: raw.var.feature_length is not present, for spike-in.
adata.raw.var['feature_biotype'] = 'spike-in'
# DataFrame.drop returns a new frame by default, so the column has to be dropped in place for
# the removal to reach adata.raw. errors='ignore' keeps the cell runnable when the column is already absent.
adata.raw.var.drop(columns=['feature_length'], errors='ignore', inplace=True)
save_and_test(adata, 'False')

In [None]:
# Invalid case: var.feature_length exists but is null, for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = None
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length exists but is null, for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = None
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is a bool, for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = False
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is a bool, for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = False
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is an empty string, for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = ''
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is an empty string, for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = ''
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is the string 'NaN', for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = 'NaN'
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is the string 'NaN', for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = 'NaN'
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is greater than 0, for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = 1
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is greater than 0, for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = 1
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is less than 0, for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = -1
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is less than 0, for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = -1
save_and_test(adata, expected='False')

In [None]:
# Invalid case: var.feature_length is a float (0.0), for spike-in.
var_table = adata.var
var_table['feature_biotype'] = 'spike-in'
var_table['feature_length'] = 0.0
save_and_test(adata, expected='False')

In [None]:
# Invalid case: raw.var.feature_length is a float (0.0), for spike-in.
raw_var_table = adata.raw.var
raw_var_table['feature_biotype'] = 'spike-in'
raw_var_table['feature_length'] = 0.0
save_and_test(adata, expected='False')