In [None]:
import anndata as ad
import os
import pandas as pd
import scanpy as sc
from scipy import sparse
import numpy as np


#directory that holds the input file and receives the revised file
#defaults to the current user's Downloads folder; update to another local directory if needed
#(the trailing separator is kept so existing string concatenations with this value still work)
download_dir = os.path.expanduser('~/Downloads/')

In [None]:
#obs column names that are checked as reserved further down
portal_obs_fields = [
    'assay', 'cell_type', 'development_stage', 'disease',
    'ethnicity', 'organism', 'sex', 'tissue',
]
#every reserved name also has an '<name>_ontology_term_id' companion column
ontology_id_fields = [f'{field}_ontology_term_id' for field in portal_obs_fields]
#complete list of obs names checked below: labels, ontology IDs, and is_primary_data
full_obs_standards = [*portal_obs_fields, *ontology_id_fields, 'is_primary_data']

In [None]:
#fill in the file name, without the '.h5ad' extension
#the same stem is reused to name the revised output file at the end of the notebook
file = 'my_matrix'

In [None]:
#establish the AnnData object
#os.path.join keeps the path valid whether or not download_dir ends with a separator
adata = sc.read_h5ad(os.path.join(download_dir, file + '.h5ad')) #add backed='r' if only looking at metadata
adata

In [None]:
#if needed, transfer to sparse matrix format
#isinstance against the public sparse.csr_matrix class avoids the deprecated scipy.sparse.csr namespace
if not isinstance(adata.X, sparse.csr_matrix):
    print('converting X to sparse')
    adata.X = sparse.csr_matrix(adata.X)
#raw is optional, so only convert it when it is set
if adata.raw is not None and not isinstance(adata.raw.X, sparse.csr_matrix):
    print('converting raw.X to sparse')
    #raw is replaced wholesale: build a new AnnData around the CSR copy of raw.X
    raw_adata = ad.AnnData(sparse.csr_matrix(adata.raw.X), var=adata.raw.var, obs=adata.obs)
    adata.raw = raw_adata

In [None]:
#CHECK MIN AND MAX OF EACH LAYER
#raw and X should not be equal; raw should be whole, positive, 10^3
if adata.raw:
    raw_matrix = adata.raw.X
    print(f'raw min = {raw_matrix.min()!s}')
    print(f'raw max = {raw_matrix.max()!s}')
x_matrix = adata.X
print(f'X min = {x_matrix.min()!s}')
print(f'X max = {x_matrix.max()!s}')

In [None]:
#if they are redundant, delete raw
#NOTE(review): the delete below is unconditional - run this cell only after the
#min/max check shows that raw and X hold the same values, otherwise raw is lost
del adata.raw

In [None]:
#check for additional layers - they may or may not be valuable
#conversions from Seurat often place the raw counts in adata.layers['counts']
#the bare expression below displays the layers as the cell output
adata.layers

In [None]:
#if additional layers, check min/max to look for redundancy
#every layer is reported under its own name, so the output is not mislabelled as X
#and the cell no longer fails when there is no 'counts' layer
for layer_name, layer_matrix in adata.layers.items():
    print(layer_name + ' min = ' + str(layer_matrix.min()))
    print(layer_name + ' max = ' + str(layer_matrix.max()))

In [None]:
#confirm at least one set of embeddings present
#the bare expression below displays obsm as the cell output
adata.obsm

In [None]:
#check for default_embedding value in obsm_keys()
if 'default_embedding' in adata.uns:
    de = adata.uns['default_embedding']
    #obsm_keys() returns a list, so join it before concatenating it into a message
    obsm_listing = ','.join(adata.obsm_keys())
    if de not in adata.obsm_keys():
        print('ERROR:' + str(de) + ' not in ' + obsm_listing)
    else:
        print(str(de) + ' is in ' + obsm_listing)

In [None]:
#check for uns schema fields
uns_schema =['schema_version','title','X_normalization']
for p in uns_schema:
    #str() keeps the report working when a stored uns value is not a plain string
    print(p + ': ' + str(adata.uns.get(p,'MISSING')))

In [None]:
#browse all of uns
#the bare expression below displays the full uns mapping as the cell output
adata.uns

In [None]:
#ensure the portal fields are not used
#ensure values for obs schema fields are valid
for field in full_obs_standards:
    print(field)
    if field not in adata.obs_keys():
        print('NOT IN OBS')
        continue
    distinct = adata.obs[field].unique()
    #categorical results are listed with to_list(), everything else with tolist()
    listing = distinct.to_list() if distinct.dtype == 'category' else distinct.tolist()
    print(listing)

In [None]:
#change a portal-reserved name, if needed
#renames the obs column in place; edit the old/new pair to match the column that needs renaming
adata.obs.rename(columns={'cell_type': 'author_cell_type'}, inplace=True)

In [None]:
#check for _colors fields & ensure they match obs fields
#dtype names treated as numeric
numb_types = ['int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64','float_', 'float16', 'float32', 'float64']

color_suffix = '_colors'
for key in adata.uns.keys():
    if not key.endswith(color_suffix):
        continue
    #the obs column a palette belongs to is the key minus its suffix
    obs_field = key[:-len(color_suffix)]
    if obs_field not in adata.obs.keys():
        print('DELETE uns.' + key)
        continue
    n_colors = len(adata.uns[key])
    n_values = len(adata.obs[obs_field].unique())
    if n_colors != n_values:
        print(f'uns.{key} is {str(n_colors)} but obs.{obs_field} is {str(n_values)}')
    if adata.obs.dtypes[obs_field].name in numb_types:
        print(f'{obs_field} is non-categorical')

In [None]:
#check 1000 random barcodes against 10x lists
def TENx_barcode_checker(df, v2_file='cellranger-whitelist/737K-august-2016.txt', v3_file='3M-february-2018.txt', n_samples=1000):
    """Tally randomly sampled barcodes from ``df.index`` against two barcode lists.

    A 16-character [ACTG] barcode is extracted from each sampled index label
    and counted as present in the v2 list only, the v3 list only, both, or
    neither. The tally is printed as indented JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels contain the cell barcodes.
    v2_file, v3_file : str
        Paths to the barcode lists, one barcode per line.
    n_samples : int
        Number of index positions drawn at random, with replacement.

    Returns
    -------
    dict or None
        Counts keyed by 'v2', 'v3', 'both' and 'neither'. None, with nothing
        printed, when ``df`` is empty or the probed index label contains no
        barcode.

    Raises
    ------
    OSError
        If a barcode list cannot be opened (only attempted once a barcode
        has been found in the index).
    """
    #imported here so the function does not rely on a later cell having run
    import json
    import re
    from random import randint

    def _load_whitelist(path):
        #a set gives constant-time lookups; the with-block closes the file
        with open(path, 'r') as handle:
            return {line.strip() for line in handle}

    barcode_pattern = re.compile('[ACTG]{16}')
    cellcount = df.index.shape[0]
    if cellcount == 0:
        return None

    #probe the 6th label as before, clamped so frames with fewer cells do not raise
    probe = str(df.index[min(5, cellcount - 1)])
    if not barcode_pattern.search(probe):
        return None

    v2_barcodes = _load_whitelist(v2_file)
    v3_barcodes = _load_whitelist(v3_file)

    barcodes = {'v2': 0,'v3': 0,'both': 0,'neither': 0}
    for _ in range(n_samples):
        label = str(df.index[randint(0, cellcount - 1)])
        found = barcode_pattern.search(label)
        if found is None:
            #labels without a barcode are skipped and not counted anywhere
            continue
        barcode = found.group(0)
        in_v2 = barcode in v2_barcodes
        in_v3 = barcode in v3_barcodes
        if in_v2 and in_v3:
            barcodes['both'] += 1
        elif in_v2:
            barcodes['v2'] += 1
        elif in_v3:
            barcodes['v3'] += 1
        else:
            barcodes['neither'] += 1
    print(json.dumps(barcodes, indent=4))
    return barcodes

In [None]:
#run the barcode checker to clarify 3' v2 or v3
#note: the 10x 5' v1 and v2 use the same barcode list as 3' v2
import json
import re
from random import randint


#one barcode tally per assay term found in obs
assay_terms = adata.obs['assay_ontology_term_id'].value_counts().keys()
for assay_term in assay_terms:
    print(assay_term)
    cells_for_assay = adata.obs[adata.obs['assay_ontology_term_id'] == assay_term]
    TENx_barcode_checker(cells_for_assay)
    print('---------')

In [None]:
#look for general obs field issues and collect obs information to check for redundant information
long_fields = []
gradient_fields = []
uber_dict = {}
#dtype names treated as numeric (gradient) fields
numb_types = ['int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64','float_', 'float16', 'float32', 'float64']

for field in adata.obs.keys():
    tally = adata.obs[field].value_counts(dropna=False).to_dict()
    #the joined per-value cell counts act as a signature for spotting redundant columns
    signature = '_'.join(str(n) for n in tally.values())
    n_distinct = len(tally)

    if field.startswith(' ') or field.endswith(' ') or '  ' in field:
        print('leading/trailing whitespace:' + field)

    #a name that only differs from a standard one by case or spacing clashes with the schema
    normalized = ' '.join(field.split()).lower()
    if field not in full_obs_standards and normalized in full_obs_standards:
        print('schema conflict:' + field)

    if n_distinct == 1 and field not in full_obs_standards:
        lone_value = str(next(iter(tally)))
        print('all same value:' + field + ',' + lone_value)

    if adata.obs.dtypes[field].name in numb_types:
        gradient_fields.append(field)
    #check for long categories as they will not be enabled for coloring
    elif n_distinct > 200:
        long_fields.append(field)

    #report value_counts to later look for redundancy
    uber_dict.setdefault(signature, []).append({
        'values': [str(value) for value in tally],
        'property': field
    })

In [None]:
#comb value_counts to report possible redundancy
for breakdown, entries in uber_dict.items():
    #skip single-value columns and breakdowns that start with '1_1'
    if '_' not in breakdown or breakdown.startswith('1_1'):
        continue
    props = [entry['property'] for entry in entries]
    #only report breakdowns shared by several columns, at least one of them non-standard
    if len(entries) <= 1 or all(name in full_obs_standards for name in props):
        continue
    print('cells breakdown: ' + breakdown)
    for name in props:
        print(name)
    print('----------------------------------------------------------------------------')

In [None]:
#investigate any fields that may be redundant
#edit the column list to the fields flagged above; the output counts cells per combination of values (NaN included)
adata.obs[['author_cell_type','cell_type_category','cell_type_ontology_term_id']].value_counts(dropna=False)

In [None]:
#check for fields that aren't appropriate as gradient (cluster number)
#gradient_fields holds the obs columns whose dtype name is in numb_types
gradient_fields

In [None]:
#update gradient to categorical, if needed
#map(str) alone leaves an object column; the extra astype makes it categorical as intended
adata.obs['cluster'] = adata.obs['cluster'].map(str).astype('category')

In [None]:
#list any categorical fields with more than 200 categories as they may not be useful in the visualization
#long_fields holds the non-numeric obs columns with more than 200 distinct values
long_fields

In [None]:
#list any obs columns to remove
obs_remove = ['tissue', 'organism', 'ethnicity', 'assay', 'disease', 'sex', 'cell_type']

#remove the columns
#only the listed columns that exist in obs are dropped and reported
present_columns = adata.obs.columns
obs_remove = [column for column in obs_remove if column in present_columns]
adata.obs.drop(columns=obs_remove, inplace=True)
f"removed: {','.join(obs_remove)}"

In [None]:
#review obs
#the bare expression below displays the obs table as the cell output
adata.obs

In [None]:
#check for ensembl IDs and redundant var fields
#check for feature_biotype, feature_is_filtered
#the bare expression below displays the var table as the cell output
adata.var

In [None]:
#similar review for raw.var, if present
#raw is None once it has been deleted or if it was never set, so guard the attribute access
adata.raw.var if adata.raw is not None else 'no raw layer present'

In [None]:
#if CellRanger count was used, check against the default references for matches in order to inform symbol-to-ID mapping
CR_12 = 'refdata-cellranger-GRCh38-1_2_0_genes_gtf.tsv'
CR_30 = 'refdata-cellranger-GRCh38-3_0_0_genes_gtf.tsv'
CR_2020 = 'refdata-gex-GRCh38-2020-A_genes_gtf.tsv'
CR_hg19 = 'refdata-cellranger-hg19-1_2_0_genes_gtf.tsv'
for ref_path in (CR_12, CR_30, CR_2020, CR_hg19):
    map_df = pd.read_csv(ref_path, sep='\t')
    print(ref_path)
    #number of rows produced by joining the var index to the reference's gene_symbols column
    matched = adata.var.merge(map_df, left_index=True, right_on='gene_symbols', how='inner')
    print(matched.shape[0])
    print('----------')

In [None]:
#fill in the mapping file to use to map symbols to Ensembl IDs
#expecting a .tsv with gene_symbols column + gene_ids column
#must be set to a real path before the following cells read it
var_mapping_file = ''

In [None]:
#view what features are not mapped in this
var_map_df = pd.read_csv(var_mapping_file, sep='\t')
#True for every var index entry that appears in the mapping's gene_symbols column
is_mapped = adata.var.index.isin(var_map_df['gene_symbols'])
adata.var[~is_mapped]

In [None]:
#create the list of approved IDs to filter on
#download files from https://github.com/chanzuckerberg/single-cell-curation/tree/main/cellxgene_schema_cli/cellxgene_schema/ontology_files
ref_files = [
    'genes_ercc.csv',
    'genes_homo_sapiens.csv',
    'genes_mus_musculus.csv',
    'genes_sars_cov_2.csv'
]

#the combined file is built once and reused on later runs
if not os.path.exists('approved_ids.csv'):
    #DataFrame.append was removed in pandas 2.0, so read every file and concatenate once
    frames = [pd.read_csv(f, names=['feature_id','symb','num'],dtype='str') for f in ref_files]
    ids = pd.concat(frames)
    ids.to_csv('approved_ids.csv',index=False)
    #delete the source files only after the combined file is written,
    #so a failed read or write cannot leave the downloads half-removed
    for f in ref_files:
        os.remove(f)

approved = pd.read_csv('approved_ids.csv',dtype='str')['feature_id']

In [None]:
#fill in any var fields to remove (typically symbols as the portal will add those)
var_remove = [
    'gene_symbols'
]

#typically, these fields are lacking and the feature_is_filtered should be false, but confirm that is true
adata.var['feature_biotype'] = 'gene'
adata.var['feature_is_filtered'] = False

#map the Ensembl IDs
#a symbol listed more than once in the mapping adds rows to the left merge, which makes
#the set_index below fail on a length mismatch, so only the first ID per symbol is kept
symbol_to_id = var_map_df.drop_duplicates(subset='gene_symbols')
n_duplicates = len(var_map_df) - len(symbol_to_id)
if n_duplicates:
    print('ignoring ' + str(n_duplicates) + ' duplicate gene_symbols rows in the mapping file')
adata.var = adata.var.merge(symbol_to_id,left_index=True,right_on='gene_symbols',how='left').set_index(adata.var.index)

#filter out genes that don't appear in the approved annotation
#the boolean mask already keeps var order, so no second membership pass is needed
var_in_approved = adata.var.index[adata.var['gene_ids'].isin(approved)].tolist()
var_to_keep = list(var_in_approved)
#copy so the var edits below act on a real object rather than on a view
adata = adata[:, var_to_keep].copy()
adata.var.set_index('gene_ids',inplace=True)

#drop the necessary columns
adata.var.drop(columns=var_remove, inplace=True)
adata.var

In [None]:
#repeat much of the same steps for the raw.var, if it exists
#raw is None once it has been deleted or if it was never set, so guard every access
if adata.raw is not None:
    adata.raw.var['feature_biotype'] = 'gene'

    raw_adata = ad.AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)

    #keep the first ID per symbol so the left merge cannot add rows and break set_index
    raw_symbol_to_id = var_map_df.drop_duplicates(subset='gene_symbols')
    raw_adata.var = raw_adata.var.merge(raw_symbol_to_id,left_index=True,right_on='gene_symbols',how='left').set_index(raw_adata.var.index)

    #copy so the var edits below act on a real object rather than on a view
    raw_adata = raw_adata[:, var_to_keep].copy()
    raw_adata.var.set_index('gene_ids',inplace=True)
    raw_adata.var.drop(columns=var_remove, inplace=True)
    adata.raw = raw_adata
else:
    print('no raw layer present - nothing to remap')
adata.raw.var if adata.raw is not None else None

In [None]:
#plot the cells to ensure they cluster by cell type
#use the embedding named in uns, or the first obsm key when none is named
embedding_keys = adata.obsm_keys()
default_embedding = adata.uns.get('default_embedding', embedding_keys[0])

sc.set_figure_params(dpi=150)
color_fields = ['cell_type_ontology_term_id']
sc.pl.embedding(adata, basis=default_embedding, color=color_fields)

In [None]:
#write the file
#the revised copy goes next to the input as '<file>_revised.h5ad'
new_one = file + '_revised.h5ad'
#os.path.join keeps the path valid whether or not download_dir ends with a separator
adata.write(filename=os.path.join(download_dir, new_one), compression='gzip')