**This notebook provides quality-assurance checks for AnnData objects being curated for CELLxGENE; many of these checks are not covered by `cellxgene-schema validate`**

In [None]:
import anndata as ad
import json
import numpy as np
import os
import pandas as pd
import re
import requests
import scanpy as sc
import subprocess
from random import randint
from scipy import sparse
from urllib.parse import quote


# Label fields; a later cell checks that these are NOT present in obs
portal_obs_fields = [
    'assay', 'cell_type', 'development_stage', 'disease',
    'self_reported_ethnicity', 'organism', 'sex', 'tissue',
]

# Fields expected in obs: one ontology term ID per label field, plus three extras
curator_obs_fields = [f'{label}_ontology_term_id' for label in portal_obs_fields]
curator_obs_fields.extend(['donor_id', 'suspension_type', 'is_primary_data'])

# Every obs field name that has a schema meaning
full_obs_standards = [*portal_obs_fields, *curator_obs_fields]

# Loading the AnnData object
**Update the path of the file**<br>
*The sample `my_matrix.h5ad` that is in this repo is subsampled from https://cellxgene.cziscience.com/e/f15e263b-6544-46cb-a46e-e33ab7ce8347.cxg/ with some metadata alterations for the purpose of this notebook*

In [None]:
# Path of the .h5ad under review; it is read below and the revised file name is derived from it
file = 'my_matrix.h5ad'

**Load the AnnData object**

In [None]:
# Read the file; the bare `adata` expression displays the object summary as the cell output
adata = sc.read_h5ad(file)
adata

# data layers
**Check if any matrix should be stored as sparse format but isn't**

In [None]:
def determine_sparsity(x):
    """Return the fraction of entries in ``x`` that are zero.

    Parameters
    ----------
    x : scipy sparse matrix/array (any format) or numpy.ndarray
        The matrix to inspect.

    Returns
    -------
    float
        ``1 - nonzero / total``. ``nan`` is returned (after printing a notice)
        when ``x`` is of an unsupported type, and also when ``x`` has no
        entries at all, so callers comparing against a threshold never crash.
    """
    if sparse.issparse(x):
        # covers every scipy sparse format, not only coo/csr/csc
        nonzero = x.count_nonzero()
    elif isinstance(x, np.ndarray):
        nonzero = np.count_nonzero(x)
    else:
        print(f'matrix is of type {type(x)}, sparsity calculation has not been implemented')
        return np.nan

    # multiply with Python ints so very large shapes cannot overflow a fixed-width integer
    total = 1
    for dim in x.shape:
        total *= int(dim)

    if total == 0:
        # sparsity is undefined for a matrix without entries
        return np.nan

    return 1 - nonzero / float(total)


# Matrices with a larger fraction of zeros than this are expected to be csr_matrix
max_sparsity = 0.5

# (label, matrix) pairs in reporting order: X, then raw.X if present, then every layer
matrices_to_check = [('X', adata.X)]
if adata.raw:
    matrices_to_check.append(('raw.X', adata.raw.X))
matrices_to_check.extend((f'layers[{name}]', adata.layers[name]) for name in adata.layers)

for label, matrix in matrices_to_check:
    sparsity = determine_sparsity(matrix)
    print(f'{label} sparsity: {sparsity}')
    is_csr = type(matrix) is sparse.csr_matrix
    if sparsity > max_sparsity and not is_csr:
        print(f'WARNING: {label} should be converted to sparse')

**Check the min/max of each layer**<br>
*Look for duplicated or other unnecessary layers*<br>
*Raw should be whole, positive, ~10<sup>3</sup>*

In [None]:
# The raw counts live in raw.X when raw exists, otherwise X is checked in their place
has_raw = bool(adata.raw)
counts_matrix = adata.raw.X if has_raw else adata.X

if has_raw:
    print('raw min = ' + str(adata.raw.X.min()))
    print('raw max = ' + str(adata.raw.X.max()))

# any value with a non-zero remainder modulo 1 is not a whole number
non_integer = np.any(np.mod(counts_matrix.data, 1) != 0)

if non_integer:
    print('ERROR: raw contains non-integer values')
else:
    print('raw is all integers')

print('X min = ' + str(adata.X.min()))
print('X max = ' + str(adata.X.max()))

for layer_name in adata.layers:
    layer = adata.layers[layer_name]
    print(f'layers[{layer_name}] min = ' + str(layer.min()))
    print(f'layers[{layer_name}] max = ' + str(layer.max()))

# obsm
**Confirm at least one set of embeddings is present**

In [None]:
# Display obsm so the available embedding keys can be reviewed
adata.obsm

**View embeddings to identify which matches paper figures**

In [None]:
# obs field used to color every embedding
cellpop_field = 'cell_type'

# Plotting stores the palette it used under this uns key. Record whether the file
# already carried one so that a contributor-supplied palette is never deleted.
palette_key = f'{cellpop_field}_colors'
palette_preexisting = palette_key in adata.uns

sc.set_figure_params(dpi=100)
for e in adata.obsm:
    sc.pl.embedding(adata, basis=e, color=cellpop_field, legend_loc='on data')

# Only remove a palette that the plotting above introduced; pop() with a default also
# avoids a KeyError when no palette was created (e.g. obsm is empty).
if not palette_preexisting:
    adata.uns.pop(palette_key, None)

**Check that the default_embedding value, if defined, is in obsm**

In [None]:
# Only relevant when the file declares a default embedding
if 'default_embedding' in adata.uns:
    de = adata.uns['default_embedding']
    available = ','.join(adata.obsm_keys())
    if de in adata.obsm_keys():
        print(de + ' is in ' + available)
    else:
        print('ERROR:' + de + ' not in ' + available)

# uns
**Check for uns schema fields**

In [None]:
# Shows the title stored in uns, or the error string when the key is absent
adata.uns.get('title','ERROR: title missing')

**Confirm `schema_version` not in uns**

In [None]:
# get() returns None when schema_version is absent, which is the expected outcome here
adata.uns.get('schema_version')

**Browse all of uns**

In [None]:
# Display the full uns mapping for manual review
adata.uns

# *_colors
**scanpy & cellxgene allow for specification of cluster colors when coloring by specific obs fields**<br>
**A list of color codes is specified in `uns.PROP_colors` where `PROP` is an obs field**<br>
**The number of color codes in `uns.PROP_colors` must be at least as large as the number of unique values in `obs.PROP`**<br>
<br>
**Check for _colors fields & ensure each matches a categorical obs field**

In [None]:
# dtype names treated as numeric, i.e. not valid targets for a categorical palette
numb_types = ['int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64','float_', 'float16', 'float32', 'float64']

for k in adata.uns.keys():
    if not k.endswith('_colors'):
        continue

    n_colors = len(adata.uns[k])
    obs_field = k[:-len('_colors')]

    # a palette keyed on the term ID field; suggest a copy keyed on the label field
    if obs_field.endswith('_ontology_term_id'):
        label_field = obs_field[:-len('_ontology_term_id')]
        print(f'WARNING: consider copying uns.{k} to uns.{label_field}_colors so palette transfers to CxG viz')

    # palettes named after a label field are compared against the matching term ID field
    if obs_field in portal_obs_fields:
        obs_field += '_ontology_term_id'

    if obs_field in adata.obs.keys():
        n_values = len(adata.obs[obs_field].unique())
        if n_colors < n_values:
            print(f'ERROR: uns.{k} has only {n_colors} colors but obs.{obs_field} has {n_values} values')
        if adata.obs.dtypes[obs_field].name in numb_types:
            print(f'ERROR: uns.{k} is associated with non-categorical {obs_field}')
    else:
        print(f'WARNING: {obs_field} not found in obs, consider DELETING or RENAMING uns.{k}')

# obs

In [None]:
# List the obs field names
adata.obs_keys()

In [None]:
# Display the obs table for manual review
adata.obs

**Ensure schema fields are present and values are valid & precise**

In [None]:
# Print each expected schema field followed by its distinct values (or a notice if missing)
for o in curator_obs_fields:
    print(o)
    if o in adata.obs_keys():
        # tolist() is available on both the categorical and the ndarray result of unique()
        print(adata.obs[o].unique().tolist())
    else:
        print('NOT IN OBS')

**Ensure the portal fields are not used**

In [None]:
# Print each label field; 'NOT IN OBS' is the desired outcome for every one of them
for o in portal_obs_fields:
    print(o)
    if o in adata.obs_keys():
        # tolist() is available on both the categorical and the ndarray result of unique()
        print(adata.obs[o].unique().tolist())
    else:
        print('NOT IN OBS')

# 10x barcode checker
**Checks a random selection of the barcodes against 10x barcode lists**<br>
*Can help confirm 3' v2 vs v3 vs multiome*<br>
*5' v1 and v2 kits use the same barcode list as 3' v2*<br>
*Assumes the barcode is in the index. Suffixes/prefixes are OK*<br>
*The barcode list files are in this repo in ref_files/ (the v3 file will need to be unzipped)*<br>
<br>
**Define the function**

In [None]:
def TENx_barcode_checker(ref_df, obs_df, num_to_check):
    """Look up a random sample of cell barcodes in a barcode reference table.

    Parameters
    ----------
    ref_df : pandas.DataFrame
        Reference table with ``barcode`` as a column or index level and a
        ``summary`` column.
    obs_df : pandas.DataFrame
        Observations whose (string) index contains the barcode; any prefix or
        suffix around the 16-base barcode is ignored.
    num_to_check : int
        Number of rows of ``obs_df`` to sample.

    Returns
    -------
    pandas.DataFrame
        One row per barcode found in the sampled index, left-joined to
        ``ref_df``. ``summary`` is ``None`` for barcodes missing from the
        reference and other missing reference values are filled with 0. When
        no sampled index contains a barcode, a frame whose ``summary`` column
        holds ``'no barcode'`` repeated ``num_to_check`` times is returned.
    """
    obs_df_sample = obs_df.sample(num_to_check, axis=0) # can add random_state=1 for reproducibility

    # every non-overlapping run of 16 A/C/T/G characters in each index value
    found = [b for hits in obs_df_sample.index.str.findall(r'[ACTG]{16}') for b in hits]
    if not found:
        return pd.DataFrame({'summary':['no barcode'] * num_to_check})

    barcodes = pd.DataFrame({'barcode': found}).set_index('barcode')
    barcode_results = barcodes.merge(ref_df, on='barcode', how='left').fillna(0)

    # Assign the result back instead of calling replace(inplace=True) on a column
    # selection: the chained in-place form does not update the frame under
    # pandas Copy-on-Write, which would leave 0 in place of None.
    barcode_results['summary'] = barcode_results['summary'].replace(0, None)
    return barcode_results

**Define `prop` and 20% of the barcodes will be checked for each unique value in `obs.prop`**

In [None]:
# obs field whose unique values define the groups of cells to sample
prop = 'donor_id'

results = []
# NOTE(review): this path is relative to the parent directory while the gene reference
# cell uses 'ref_files/' - confirm which one matches where the notebook is run from
csv = '../ref_files/10X_barcode_table.csv'
ref_df = pd.read_csv(csv, sep=',', header=0, index_col='barcode')

for a in adata.obs[prop].unique():
    obs_df = adata.obs[adata.obs[prop] == a]
    # default is to check 20% of barcodes, but always at least one when the group has
    # any cells: plain floor division checked nothing for groups of fewer than 5 cells
    num_to_check = min(len(obs_df), max(1, obs_df.shape[0] // 5))
    r = TENx_barcode_checker(ref_df, obs_df, num_to_check)
    # value_counts() drops missing values, so barcodes that match no reference list are
    # relabelled 'None' first; otherwise the 'None' column below would always read 0
    summary_counts = r['summary'].fillna('None').value_counts().to_dict()
    r_dict = {'3pv2_5pv1_5pv2': None, '3pv3': None, 'multiome': None,'multiple': None, 'None': None} | summary_counts
    r_dict[prop] = a
    results.append(r_dict)

print(pd.DataFrame(results).set_index(prop).fillna(0).astype(int))

**Look for general obs field issues and collect obs information to check for redundant information**

In [None]:
# dtype names treated as numeric (shown as a gradient rather than as categories)
numb_types = ['int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64','float_', 'float16', 'float32', 'float64']

long_fields = []       # non-numeric fields with more than 200 distinct values
gradient_fields = []   # numeric fields
uber_dict = {}         # value-count signature -> fields sharing that signature

for o in adata.obs.keys():
    vc_dict = adata.obs[o].value_counts(dropna=False).to_dict()
    values = [str(v) for v in vc_dict]
    counts = '_'.join(str(c) for c in vc_dict.values())
    count_len = len(vc_dict)

    # leading, trailing or doubled spaces in the field name
    if o != o.strip(' ') or '  ' in o:
        print('leading/trailing whitespace:' + o)

    # the name only differs from a schema field by case or whitespace
    normalized = ' '.join(o.split()).lower()
    if o not in full_obs_standards and normalized in full_obs_standards:
        print('schema conflict:' + o)

    if count_len == 1 and o not in full_obs_standards:
        print('all same value:' + o + ',' + values[0])

    if adata.obs.dtypes[o].name in numb_types:
        gradient_fields.append(o)
        continue

    # check for long categories as they will not be enabled for coloring
    if count_len > 200:
        long_fields.append(o)

    # report value_counts to later look for redundancy
    uber_dict.setdefault(counts, []).append({
        'values': values,
        'property': o
    })

**Comb value_counts to report possible redundancy**

In [None]:
# Report groups of fields that split the cells into identically sized groups
for k, v in uber_dict.items():
    # skip single-value fields and signatures that start with '1_1'
    if '_' not in k or k.startswith('1_1'):
        continue

    props = [entry['property'] for entry in v]
    # a signature is only interesting when shared and not made up solely of schema fields
    if len(v) <= 1 or all(p in full_obs_standards for p in props):
        continue

    print('cells breakdown: ' + k)
    for p in props:
        print(p)
    print('----------------------------------------------------------------------------')

**Investigate any fields that may be redundant**

In [None]:
# Cross-tabulate two fields to see whether they carry the same information.
# NOTE(review): the two field names look like an example pair - substitute the fields
# reported by the previous cell; a name that is absent from obs raises a KeyError
adata.obs[['development_stage_ontology_term_id','age']].value_counts(dropna=False)

**Check for fields that aren't appropriate as gradient (e.g. cluster number)**

In [None]:
# Numeric obs fields collected by the obs field scan
gradient_fields

**List any categorical fields with more than 200 categories as they may not be useful in the visualization**

In [None]:
# Non-numeric obs fields with more than 200 distinct values, collected by the obs field scan
long_fields

**See if any donors have variable donor-level metadata**

In [None]:
# Fields that are expected to be constant within a donor
donor_fields = [
    'donor_id',
    'sex_ontology_term_id',
    'development_stage_ontology_term_id',
    'self_reported_ethnicity_ontology_term_id',
    'disease_ontology_term_id',
]

# one row per distinct combination of the donor fields, with its cell count
donor_df = adata.obs[donor_fields].value_counts().to_frame().reset_index()

# a donor_id that appears on more than one row has conflicting donor-level metadata
conflicting = donor_df.duplicated(subset='donor_id', keep=False)
donor_df[conflicting].sort_values('donor_id')

**Browse the per donor metadata**

In [None]:
# Display every combination of donor-level metadata with its cell count
donor_df

# var
**Check for Ensembl IDs, redundant fields, etc.**<br>
**Check for schema fields**

In [None]:
# Display the var table for manual review
adata.var

**Similar checks for raw.var, if present**

In [None]:
# raw is optional (other cells guard with `if adata.raw`), so only access raw.var when it exists
adata.raw.var if adata.raw else 'raw is not present'

# Raw counts
*Check if any observations have exactly the same raw count vector to identify possible duplication*

In [None]:
def _row_hashes(matrix, chunk_size=1000):
    """Return one hash per row of ``matrix``, computed from the row's dense bytes.

    Only ``chunk_size`` rows are densified at a time so a large sparse matrix is never
    expanded in full. Dense arrays are accepted as well as scipy sparse matrices.
    """
    if sparse.issparse(matrix):
        # csr supports row slicing for every input format; a csr input is not copied
        matrix = matrix.tocsr()

    row_hashes = []
    for start in range(0, matrix.shape[0], chunk_size):
        chunk = matrix[start:start + chunk_size]
        dense = chunk.toarray() if sparse.issparse(chunk) else np.asarray(chunk)
        row_hashes.extend(hash(row.tobytes()) for row in dense)
    return row_hashes


# the raw counts live in raw.X when raw exists, otherwise in X
hashes = _row_hashes(adata.raw.X if adata.raw else adata.X)

hash_df = adata.obs.copy()
hash_df['hashes'] = hashes
# keep only observations whose count vector hash occurs more than once
hash_df = hash_df[hash_df.duplicated(subset='hashes', keep=False)].sort_values('hashes')
hash_df

# Validate
**Determine the embedding by which to plot**\
May need to overwrite if the first obsm is not informative

In [None]:
# Preference order: uns['default_embedding'], then a UMAP key, then a tSNE key,
# then the first obsm key. When several keys match, the last one seen wins.
default_embedding = adata.uns.get('default_embedding')
umap_embedding = None
tsne_embedding = None
for k in adata.obsm_keys():
    if 'umap' in k.lower():
        umap_embedding = k
    elif 'tsne' in k.lower():
        tsne_embedding = k

if not default_embedding:
    if umap_embedding:
        default_embedding = umap_embedding
    elif tsne_embedding:
        default_embedding = tsne_embedding
    elif adata.obsm_keys():
        default_embedding = adata.obsm_keys()[0]
    else:
        # report in the notebook's usual style instead of failing with an IndexError
        print('ERROR: obsm is empty, so there is no embedding to plot')
default_embedding

**Plot the cells to ensure they cluster by cell type**

In [None]:
# The plot below sets a color palette in uns. Record whether the file already carried
# one so that only a palette introduced by the plot is removed afterwards; deleting
# unconditionally would strip a contributor-supplied palette from the revised file.
cell_type_palette_key = 'cell_type_ontology_term_id_colors'
cell_type_palette_preexisting = cell_type_palette_key in adata.uns

sc.set_figure_params(dpi=150)
sc.pl.embedding(adata, basis=default_embedding, color=['cell_type_ontology_term_id'])

if not cell_type_palette_preexisting:
    # pop() with a default also avoids a KeyError when no palette was created
    adata.uns.pop(cell_type_palette_key, None)

**Plot by multiple genes using the normalized counts**<br>
*It is best to get a list of genes relevant to the specific data from the contributor/publication*

In [None]:
# Gene symbols to plot; replace with genes relevant to the dataset under review
symbol_list = [
    'CD34',
    'IGLL1',
    'TRGC2',
    'CCR9',
    'CCR7',
    'HIVEP3',
    'TOX2',
    'RAG1',
    'RAG2',
    'PCNA',
    'CDK1'
]

ref_files = [
    'genes_ercc.csv',
    'genes_homo_sapiens.csv',
    'genes_mus_musculus.csv',
    'genes_sars_cov_2.csv'
]

ref_dir = 'ref_files/'
approved_path = ref_dir + 'genes_approved.csv'

# Build the combined gene table once.
# NOTE(review): the per-organism files are read from the working directory (not ref_dir)
# and deleted after being merged - confirm that this is intended
if not os.path.exists(approved_path):
    frames = []
    for f in ref_files:
        frames.append(pd.read_csv(f, names=['feature_id','symb','num','length'],dtype='str',index_col=False))
        os.remove(f)
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    pd.concat(frames).to_csv(approved_path, index=False)

approved = pd.read_csv(approved_path,dtype='str')

# symbol -> first feature_id listed for it, built once instead of rescanning per symbol
symbol_to_id = approved.drop_duplicates('symb').set_index('symb')['feature_id'].to_dict()

ensg_list = []
for s in symbol_list:
    # try the symbol as given, then with only the first letter kept as is (e.g. Cd34)
    alt = s[:1] + s[1:].lower()
    for candidate in (s, alt):
        ensg_id = symbol_to_id.get(candidate)
        if ensg_id is not None and ensg_id in adata.var.index:
            ensg_list.append(ensg_id)
            print(ensg_id + ' -- ' + candidate)
            break
    else:
        # neither spelling led to a feature in var; report why, including the case
        # where the symbol is missing from the gene file (previously skipped silently)
        if alt in symbol_to_id:
            print(f'{alt}/{symbol_to_id[alt]} not found in var')
        else:
            print(f'{alt} not found in gene file')

In [None]:
# Color the chosen embedding by each selected gene, reading expression from X (use_raw=False)
sc.pl.embedding(adata, basis=default_embedding, color=ensg_list, use_raw=False)

**Compare with the same genes using the raw counts to confirm they are correlated**

In [None]:
# raw is optional (other cells guard with `if adata.raw`); plotting with use_raw=True needs it
if adata.raw:
    sc.pl.embedding(adata, basis=default_embedding, color=ensg_list, use_raw=True)
else:
    print('raw is not present, skipping the raw counts plot')

**Additionally, you could compare dotplots of those genes in each cell population**\
*This will scale all genes based on the max range of any gene so 1 gene with high values may make others difficult to distinguish*

In [None]:
# Dotplot of the selected genes grouped by cell type, reading expression from X (use_raw=False)
sc.pl.dotplot(adata, ensg_list, 'cell_type_ontology_term_id', use_raw=False)

In [None]:
# raw is optional (other cells guard with `if adata.raw`); plotting with use_raw=True needs it
if adata.raw:
    sc.pl.dotplot(adata, ensg_list, 'cell_type_ontology_term_id', use_raw=True)
else:
    print('raw is not present, skipping the raw counts dotplot')

**If it is spatial data, test if the image and X_spatial embeddings enable scanpy use**

In [None]:
# Only attempt the spatial plot when spatial coordinates are present in obsm.
# NOTE(review): assumes scanpy looks the coordinates up under 'spatial' or 'X_spatial' - confirm
spatial_palette_key = 'cell_type_ontology_term_id_colors'
if 'spatial' in adata.obsm or 'X_spatial' in adata.obsm:
    spatial_palette_preexisting = spatial_palette_key in adata.uns
    sc.pl.spatial(adata, color='cell_type_ontology_term_id')
    # do not leave a plot-generated palette in uns, since the object is written to file below
    if not spatial_palette_preexisting:
        adata.uns.pop(spatial_palette_key, None)
else:
    print('no spatial coordinates in obsm, skipping the spatial plot')

**If updates have been made, write the revised file**\
*`compression='gzip'` is critical here to keep the file size down*

In [None]:
# Insert '_revised' before the extension. Unlike str.replace('.h5ad', ...), this can never
# return the input path unchanged (e.g. for an upper-case or missing extension), so the
# original file is not overwritten.
file_root, file_ext = os.path.splitext(file)
new_one = f'{file_root}_revised{file_ext or ".h5ad"}'
adata.write(filename=new_one, compression='gzip')
new_one

**Run the CELLxGENE validator on the revised file**<br>
*This is the same as running `cellxgene-schema validate <file>` in the terminal*

In [None]:
# Run the validator on the revised file, capturing both output streams
validate_process = subprocess.run(
    ['cellxgene-schema', 'validate', new_one],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)

# echo stdout first, then stderr
for stream in (validate_process.stdout, validate_process.stderr):
    print(stream.decode('utf-8'))