**This notebook provides quality-assurance checks for AnnData objects headed for CELLxGENE curation, many of which are not covered by `cellxgene-schema validate`**

In [None]:
import anndata as ad
import json
import pandas as pd
import requests
import scanpy as sc
from IPython.display import display
from scipy import sparse
from urllib.parse import quote
from cellxgene_mods import *

# Loading the AnnData object
**Update the path of the file**<br>
*The sample `valid_mouse.h5ad` is a valid mouse test fixture that will be updated with each release*

In [None]:
# Path to the .h5ad under review; replace the sample fixture with the dataset being curated
file = 'validator_testing/fixtures/valid_mouse.h5ad'

**Load the AnnData object**

In [None]:
# Read the whole h5ad into memory
adata = sc.read_h5ad(file)
# A 'schema_reference' key in uns switches the later cells to the label columns
# ('cell_type', 'assay', ...) instead of the *_ontology_term_id columns
# (presumably it marks a file exported from the portal -- confirm).
cxg_labels = 'schema_reference' in adata.uns
adata

# data

**Check if any matrix should be stored as sparse format but isn't**

In [None]:
# Helper presumably supplied by the cellxgene_mods star import; flags matrices that should be sparse but are not
evaluate_sparsity(adata)

**Look for duplicated layers, raw integer counts of reasonable scale (typically ~10<sup>3</sup> or more), etc.**

In [None]:
# Helper presumably supplied by the cellxgene_mods star import; reviews the data matrices (duplicated layers, raw count scale, ...)
evaluate_data(adata)

**Check if any observations have exactly the same raw count vector to identify possible duplication**

In [None]:
# Helper presumably supplied by the cellxgene_mods star import; looks for observations sharing an identical raw count vector
evaluate_dup_counts(adata)

# obsm
**Confirm at least one set of embeddings is present**

In [None]:
# Display obsm so the available embeddings can be confirmed by eye
adata.obsm

**View the embeddings to identify which ones match the paper figures**

In [None]:
# Colour every embedding by cell population: files flagged by cxg_labels use the
# 'cell_type' label column, all others use the ontology term ID column.
cellpop_field = 'cell_type' if cxg_labels else 'cell_type_ontology_term_id'
colors_key = f'{cellpop_field}_colors'

# If the palette is not in uns before plotting, any palette found afterwards is a
# side effect of this cell and is removed again so uns is left as it was loaded.
remove_colors = colors_key not in adata.uns

sc.set_figure_params(dpi=100)
for e in adata.obsm:
    if e.startswith('X_'):
        sc.pl.embedding(adata, basis=e, color=cellpop_field, legend_loc='on data')
    elif e == 'spatial':
        # NOTE(review): np is not imported explicitly in this notebook; presumably it
        # arrives through `from cellxgene_mods import *` -- confirm.
        if np.isnan(adata.obsm['spatial']).any():
            report("obsm['spatial'] contains nans", 'ERROR')
        sc.pl.embedding(adata, basis=e, color=cellpop_field, legend_loc='on data')
    else:
        report(f'{e} will not be plotted')
if remove_colors:
    # pop() instead of del: when no embedding was plotted the palette key was never
    # created, and del would raise KeyError.
    adata.uns.pop(colors_key, None)

**Check that the default_embedding value, if defined, is in obsm**

In [None]:
# Only relevant when a default_embedding has been set in uns
if 'default_embedding' in adata.uns:
    de = adata.uns['default_embedding']
    obsm_listing = ",".join(adata.obsm_keys())
    if de in adata.obsm_keys():
        report(f'{de} is in [{obsm_listing}]', 'GOOD')
    else:
        report(f'{de} not in [{obsm_listing}]', 'ERROR')

# uns
**Check for required uns fields**

In [None]:
# Print each required uns field, or flag it when it is absent
for uns_field in curator_uns_fields:
    if uns_field not in adata.uns:
        report(f'{uns_field} is required', 'ERROR')
        continue
    print(f'{uns_field}: ', adata.uns[uns_field])

**Confirm portal fields are not in uns**

In [None]:
# Skipped when cxg_labels is set; otherwise every portal field found in uns is an error
if not cxg_labels:
    unexpected_fields = [f for f in portal_uns_fields if f in adata.uns]
    for f in unexpected_fields:
        report(f'{f} should not be present in uns', 'ERROR')

**Check any _colors fields defined in uns**

In [None]:
# Helper presumably supplied by the cellxgene_mods star import; reviews the *_colors entries in uns
evaluate_uns_colors(adata)

**Browse all of uns**

In [None]:
# Display the full uns mapping for a manual scan
adata.uns

# obs

In [None]:
# Summary of the obs columns (dtypes and non-null counts)
adata.obs.info()

In [None]:
# Display the obs dataframe for a manual scan
adata.obs

**Ensure obs schema fields are present & precise, and no conflict with portal fields**

In [None]:
# labels=cxg_labels tells the helper whether this file was flagged via uns['schema_reference'] at load time
evaluate_obs_schema(adata.obs, labels=cxg_labels)

In [None]:
# Temporary check for terms that will be invalid at the next CELLxGENE schema release
deprecated_ethn = {
    'HANCESTRO:0005': 'European',
    'HANCESTRO:0010': 'African',
    'HANCESTRO:0011': 'Sub-Saharan African',
    'HANCESTRO:0036': 'Middle Africa',
    'HANCESTRO:0576': 'West African',
    'HANCESTRO:0016': 'African American or Afro-Caribbean',
    'HANCESTRO:0017': 'Oceanian',
    'HANCESTRO:0310': 'Val Borbera',
    'HANCESTRO:0311': 'Friuli Venezia Giulia',
    'HANCESTRO:0312': 'Carlantino',
    'HANCESTRO:0317': 'Erasmus Rucphen',
    'HANCESTRO:0319': 'Korculan',
    'HANCESTRO:0556': 'Cilento',
    'HANCESTRO:0569': 'Talana'
}
ethn_field = 'self_reported_ethnicity_ontology_term_id'
if ethn_field in adata.obs:
    # NOTE(review): a single value may list several terms; the delimiter is assumed
    # to be ',' or ' || ' depending on the schema version -- confirm. Splitting on
    # both means a deprecated term inside a multi-term value is still caught.
    terms_in_adata = {
        term.strip()
        for value in adata.obs[ethn_field].unique()
        for term in str(value).replace('||', ',').split(',')
    }
    dep_in_adata = [e for e in deprecated_ethn if e in terms_in_adata]
else:
    # Without the column there is nothing to scan; say so instead of raising KeyError
    dep_in_adata = []
    report(f'{ethn_field} not found in obs, deprecated term check skipped', 'WARNING')
for e in dep_in_adata:
    report(f'{e} [{deprecated_ethn[e]}] will be deprecated in schema 7.0.0','WARNING')

# 10x barcode checker
**Checks the barcodes in obs index against 10x barcode lists**<br>
*5' v1 and v2 kits use the same barcode list as 3' v2*<br>
*`evaluate_10x_barcodes()` only needs to be run once*<br>
*To generate a report based on a different variable, use any obs field as the second argument in `parse_barcode_df()`*<br>
*To generate a report based on a subset of cells, create a subset dataframe from `barcode_df` and use that as the first argument in `parse_barcode_df()`*

In [None]:
# Files flagged by cxg_labels are grouped by the 'assay' label column, others by the term ID column
assay_field = 'assay' if cxg_labels else 'assay_ontology_term_id'

# barcode_df is kept in a variable so parse_barcode_df() can be re-run on it
# (other obs fields, or a subset of rows) without recomputing the barcode evaluation
barcode_df = evaluate_10x_barcodes(adata.obs)
parse_barcode_df(barcode_df, assay_field)

**Look for general obs field issues, redundant information,** \
**fields that aren't appropriate as gradient (e.g. cluster number),** \
**any categorical fields with more than 200 categories as they may not be useful in the visualization**

In [None]:
# full_obs_standards is not defined in this notebook; presumably supplied by the cellxgene_mods star import
evaluate_obs(adata.obs, full_obs_standards)

**Investigate any fields that may be redundant**

In [None]:
#adata.obs[['development_stage_ontology_term_id','author_age']].value_counts(dropna=False)

**See if any donors have variable donor-level metadata** \
*disease may need to be removed from donor_fields if a donor contributed healthy and disease tissue within the study* \
*development_stage may need to be removed from donor_fields for longitudinal studies*

In [None]:
# obs columns expected to hold a single value per donor
donor_fields = [
    'donor_id',
    'sex_ontology_term_id',
    'development_stage_ontology_term_id',
    'self_reported_ethnicity_ontology_term_id',
    'disease_ontology_term_id',
]
if cxg_labels:
    # Switch to the label columns by dropping the term ID suffix ('donor_id' is unaffected)
    donor_fields = [name.replace('_ontology_term_id','') for name in donor_fields]

# One row per combination of donor-level values, with its count
combo_counts = adata.obs[donor_fields].value_counts()
donor_df = pd.DataFrame(combo_counts).reset_index()

# A donor_id occurring on more than one row has conflicting donor-level metadata
repeated_donor = donor_df.duplicated(subset='donor_id', keep=False)
inconsistencies = donor_df[repeated_donor].sort_values('donor_id')
if not inconsistencies.empty:
    report('donor metadata inconsistencies', 'ERROR')
    display(inconsistencies)

**Browse the per donor metadata**

In [None]:
# Display the per-donor combinations computed in the previous cell
donor_df

**Check human donors' reported sex**<br>
The metric (x) used to determine sex is the ratio of male to female raw expression counts summed across all genes found in sex_analysis_genes.json. 
- Donor is considered **male:** *if x > 0.35*
- Donor is considered **female:** *if x < 0.05*
- Donor is considered **unknown:** *if 0.05 < x < 0.35*

In [None]:
donor_sex_df, dp = evaluate_donors_sex(adata)

if donor_sex_df is not None:
    # Donors whose expression-derived sex differs from the author annotation
    disagrees = donor_sex_df['scRNAseq_sex'] != donor_sex_df['author_annotated_sex']
    inconsistencies = donor_sex_df[disagrees].sort_values('donor_id')
    if inconsistencies.empty:
        report('donor sex metadata is consistent', 'GOOD')
    else:
        annotated = inconsistencies['author_annotated_sex']
        inferred = inconsistencies['scRNAseq_sex']

        # Annotated as unknown while the expression data gives a different value
        curated_unknowns = inconsistencies[annotated == 'unknown']
        if not curated_unknowns.empty:
            report('donor sex metadata can be filled in', 'WARNING')
            display(curated_unknowns)

        # Direct contradictions only: male vs female in either direction
        male_vs_female = (annotated == 'male') & (inferred == 'female')
        female_vs_male = (annotated == 'female') & (inferred == 'male')
        true_inconsistencies = inconsistencies[male_vs_female | female_vs_male]
        if not true_inconsistencies.empty:
            report('donor sex metadata inconsistencies', 'ERROR')
            display(true_inconsistencies)

# Show the second object returned by evaluate_donors_sex when there is one
if dp:
    dp.show()

# var
**Check for Ensembl IDs, redundant fields, etc.**<br>
**Check for schema fields**

In [None]:
# Display the var dataframe for a manual scan
adata.var

In [None]:
# Helper presumably supplied by the cellxgene_mods star import; receives the whole AnnData rather than only adata.var
evaluate_var_df(adata)

**Similar checks for raw.var, if present**

In [None]:
# raw is optional; display its var dataframe only when adata.raw is set
if adata.raw:
    display(adata.raw.var)

# Validate
**Determine the embedding by which to plot**\
May need to overwrite if the first obsm is not informative

In [None]:
# Use uns['default_embedding'] when present, otherwise whatever pick_embed() selects from the obsm keys.
# Note: pick_embed() is called either way, because dict.get() evaluates its default argument eagerly.
default_embedding = adata.uns.get('default_embedding', pick_embed(adata.obsm_keys()))
default_embedding

**Plot by multiple genes using the normalized counts**<br>
*It is best to get a list of genes relevant to the specific data from the contributor/publication*

In [None]:
# Example gene symbols; replace with genes relevant to the dataset under review.
# NOTE(review): these are upper-case, human-style symbols -- confirm they match the
# symbols in adata.var for the organism being checked.
symbols = [
    'CD34',
    'IGLL1',
    'TRGC2',
    'CCR9',
    'CCR7',
    'HIVEP3',
    'TOX2',
    'RAG1',
    'RAG2',
    'PCNA',
    'CDK1'
]

# Translate the symbols into the identifiers used for plotting, looked up via adata.var
ensg_list = symbols_to_ids(symbols, adata.var)

In [None]:
# One panel per gene in ensg_list on the chosen embedding; use_raw=False plots from adata.X rather than adata.raw
sc.pl.embedding(adata, basis=default_embedding, color=ensg_list, use_raw=False)

**Compare with the same genes using the raw counts to confirm they are correlated**

In [None]:
# Same panels drawn from adata.raw; skipped when the file has no raw slot
if adata.raw:
    sc.pl.embedding(adata, basis=default_embedding, color=ensg_list, use_raw=True)

**Additionally, you could compare dotplots of those genes in each cell population**\
*This will scale all genes based on the max range of any gene so 1 gene with high values may make others difficult to distinguish*

In [None]:
# Dot plot of the selected genes grouped by cellpop_field (set in the embedding cell), using adata.X
sc.pl.dotplot(adata, ensg_list, cellpop_field, use_raw=False)

In [None]:
# Same dot plot drawn from adata.raw; skipped when the file has no raw slot
if adata.raw:
    sc.pl.dotplot(adata, ensg_list, cellpop_field, use_raw=True)

**If it is spatial data, test if the image and X_spatial embeddings enable scanpy use**

In [None]:
# Assay term IDs treated as Visium by this notebook
visium_assays = ['EFO:0022857','EFO:0022860','EFO:0022859']

# Only the first unique assay value is inspected
first_assay = adata.obs['assay_ontology_term_id'].unique()[0]
if first_assay in visium_assays:
    spatial_uns = adata.uns.get('spatial')
    if spatial_uns is None:
        # Report the missing entry instead of failing with a KeyError
        report("uns['spatial'] not found, cannot plot the Visium image", 'ERROR')
    elif spatial_uns.get('is_single'):
        plot_vis(adata, cellpop_field)
        if remove_colors:
            # pop() instead of del so a palette key that was never created does not raise KeyError
            adata.uns.pop(f'{cellpop_field}_colors', None)

**If updates have been made, write the revised file**\
*`compression='gzip'` is critical here to keep the file size down*\
**Run the CELLxGENE validator on the revised file**\
*This is the same as running `cellxgene-schema validate <file>` in the terminal*

In [None]:
# new_one = file.replace('.h5ad','_revised.h5ad')
# adata.write(filename=new_one, compression='gzip')

# valid = validate(new_one)