In [None]:
import os
import sys

import pandas as pd
from IPython.display import display
from cellxgene_mods import CxG_API,compare_revision,report

sys.path.append('..')
from public_resources.public_resources import detect_sequence_data


# Configure the CxG_API helper; called with no arguments here.
CxG_API.config() # pass env='dev' or env='staging' when working in either of those test environments

**Specify the Collection**

In [None]:
# Fill in before running the rest of the notebook:
#   collection_id - id of the Collection passed to CxG_API.get_collection
#   my_dir        - local directory the .h5ad files are read from during donor QA
collection_id = ""
my_dir = ""

**General QA**
- If Revision, review updates
- Check for consortia
- Check for duplicate Dataset titles
- Review is_primary_data curation within Collection

In [None]:
collection = CxG_API.get_collection(collection_id)

# TODO: run the downloads in parallel, or at least in the background, so the
# checks below can proceed while the files are being fetched.
# NOTE(review): `requests` is used here but is not imported in the visible
# cells - confirm it is imported before this cell runs.
# Download every H5AD asset into my_dir, where the donor QA cell reads them.
for dataset in collection['datasets']:
    if dataset['cell_count'] <= 1:
        continue
    for asset in dataset['assets']:
        if asset['filetype'] != 'H5AD':
            continue
        file_name = asset['url'].split('/')[-1]
        # os.path.join falls back to the working directory when my_dir is ''
        file_path = os.path.join(my_dir, file_name)
        print(file_name)
        if os.path.exists(file_path):
            continue
        print('Downloading')
        # Stream to a temporary name and rename only once complete, so an
        # interrupted download is never mistaken for a finished file.
        partial_path = file_path + '.part'
        with requests.get(asset['url'], stream=True) as res:
            res.raise_for_status()
            with open(partial_path, 'wb') as out_file:
                for chunk in res.iter_content(chunk_size=1024 * 1024):
                    out_file.write(chunk)
        os.replace(partial_path, file_path)

# Revision-related Collections are passed through compare_revision
if collection.get('revising_in') or collection.get('revision_of'):
    collection = compare_revision(collection)

if not collection.get('consortia'):
    report('No consortia - confirm this is correct', 'WARNING')

df = pd.DataFrame(collection['datasets'])[['title','dataset_id','is_primary_data','cell_count','primary_cell_count']]

# Dataset titles must be unique within the Collection
duplicate_titles = df[df.duplicated(subset='title', keep=False)]
if not duplicate_titles.empty:
    report('non-unique Dataset titles','ERROR')
    display(duplicate_titles.sort_values('title'))

# Displayed so is_primary_data curation can be reviewed across the Collection
df.sort_values('is_primary_data', ascending=False)

**Validate links for presence of raw sequence data**

In [None]:
# Check each Collection link for raw sequence data and confirm that its
# link_type agrees with what was detected at the URL.
raw_data_link = False
for link in collection['links']:
    link_url = link['link_url']
    link_type = link['link_type']
    raw_formats_present = detect_sequence_data(link_url)

    # 'undetermined' presumably means the URL could not be assessed, so no
    # verdict is made for this link - TODO confirm against detect_sequence_data.
    if raw_formats_present == 'undetermined':
        continue

    if raw_formats_present:
        raw_data_link = True
        if link_type != 'RAW_DATA':
            report(f'raw data found at {link_url}, expecting link_type:RAW DATA, not {link_type}', 'ERROR')
    elif link_type == 'RAW_DATA':
        report(f'link_type:RAW DATA but raw data not found at {link_url}', 'ERROR')

if not raw_data_link:
    report('No raw data link present', 'WARNING')

**QA Donor metadata**

In [None]:
# Gather the obs table of every .h5ad file in my_dir into one DataFrame,
# tagging each row with the title of the Dataset it came from.
# NOTE(review): `sc` (scanpy) is not imported in the visible cells - confirm.
obs_frames = []
for file_name in sorted(os.listdir(my_dir)):  # sorted for a reproducible row order
    if not file_name.endswith('.h5ad'):
        continue
    # os.path.join works whether or not my_dir ends with a path separator
    adata = sc.read_h5ad(os.path.join(my_dir, file_name), backed='r')
    try:
        obs = adata.obs.copy()
        obs['title'] = adata.uns['title']
    finally:
        # release the backing file handle once obs has been copied out
        adata.file.close()
    obs_frames.append(obs)

# Concatenate once rather than growing a DataFrame inside the loop
all_obs = pd.concat(obs_frames) if obs_frames else pd.DataFrame()

In [None]:
# obs columns compared per donor_id in this check
donor_fields = [
    'donor_id',
    'sex',
    'sex_ontology_term_id',
    'development_stage_ontology_term_id',
    'development_stage',
    'self_reported_ethnicity_ontology_term_id',
    'self_reported_ethnicity',
    'disease_ontology_term_id',
    'disease',
    'tissue_type',
    'data_source',
]

# One row per distinct combination of the donor fields, with its count
donor_df = all_obs[donor_fields].value_counts().to_frame().reset_index()

# A donor_id appearing on more than one row has differing values in at least
# one of the other fields.
repeated_donor = donor_df.duplicated(subset='donor_id', keep=False)
inconsistencies = donor_df[repeated_donor].sort_values('donor_id')
if not inconsistencies.empty:
    report('donor metadata inconsistencies', 'ERROR')
    display(inconsistencies)

In [None]:
# Title of the Dataset whose donors serve as the reference set
all_cells = 'Oral and Craniofacial Atlas'
in_all_cells = all_obs['title'] == all_cells
all_cells_donors = all_obs[in_all_cells]['donor_id'].unique()

# Donors present somewhere in all_obs but absent from the reference Dataset
new_donors = [
    donor
    for donor in all_obs['donor_id'].unique()
    if donor not in all_cells_donors
]
new_donors

**Look for consistency with other instances of the donors across CELLxGENE**

In [None]:
# Fetch Dataset records through CxG_API; the following cell reads the
# 'collection_id' and 'donor_id' entries of each record.
datasets = CxG_API.get_datasets()

In [None]:
# Print the collection_id of every Dataset that lists one of this
# Collection's donor_ids, together with the matching ids.
collection_donor_ids = donor_df['donor_id'].unique()
for dataset in datasets:
    donor_match = [
        donor for donor in collection_donor_ids if donor in dataset['donor_id']
    ]
    if not donor_match:
        continue
    print(dataset['collection_id'], donor_match)

In [None]:
# Collect the Datasets that hold primary data and share at least one donor_id
# with this Collection, then pull their obs tables for comparison.
# NOTE(review): `h5py`, `fs` and `read_elem` are not imported/defined in the
# visible cells - confirm they are set up before this cell runs.
collection_donor_ids = donor_df['donor_id'].unique()

shared_donor_datasets = []
for d in datasets:  # list fetched above via CxG_API.get_datasets()
    if True in d['is_primary_data']:
        donor_match = [p for p in collection_donor_ids if p in d['donor_id']]
        if donor_match:
            shared_donor_datasets.append(d)

reused_obs_frames = []
for d in shared_donor_datasets:
    # Both the remote file object and the HDF5 handle are closed on exit.
    # Assumes fs.open returns a context-manager file object - TODO confirm.
    with fs.open(f"corpora-data-prod/{d['dataset_version_id']}/local.h5ad") as remote_file:
        with h5py.File(remote_file) as f:
            obs = read_elem(f["obs"])
    obs['title'] = d['title']
    reused_obs_frames.append(obs)

# Concatenate once rather than growing a DataFrame inside the loop
reused_obs = pd.concat(reused_obs_frames) if reused_obs_frames else pd.DataFrame()
reused_obs

In [None]:
# Repeat the donor consistency check across this Collection's obs plus the obs
# of the other Datasets that reuse the same donors.
concat_obs = pd.concat([all_obs, reused_obs])

# Note: donor_fields and donor_df are redefined here with a reduced field set
donor_fields = ['donor_id',
                'sex',
                'development_stage',
                'self_reported_ethnicity',
                'disease']

# .copy() so the column assignment below does not write into a slice of concat_obs
donor_df = concat_obs[donor_fields].copy()
# Treat 'human stage' and 'stage' wording as the same label; presumably the
# wording differs between Datasets - TODO confirm.
donor_df['development_stage'] = donor_df['development_stage'].apply(lambda x: x.replace('human stage','stage'))
donor_df = donor_df.value_counts().to_frame().reset_index()

# A donor_id on more than one row has conflicting values in some field
inconsistencies = donor_df[donor_df.duplicated(subset='donor_id', keep=False)].sort_values('donor_id')
if not inconsistencies.empty:
    report('donor metadata inconsistencies', 'ERROR')
    display(inconsistencies)

In [None]:
# Spot-check one donor: distinct (donor_id, title, development_stage) rows
donor_rows = concat_obs['donor_id'].isin(['284C'])
columns_to_show = ['donor_id','title','development_stage']
concat_obs[donor_rows][columns_to_show].drop_duplicates()

**QA many spatial Datasets**

In [None]:
# Curator note: Visium + 4992 observations -> is_primary_data:True, else False
# (relates to is_single) - TODO confirm the intended rule.
# NOTE(review): `sc`, `requests`, evaluate_sparsity, evaluate_data,
# evaluate_dup_counts, symbols_to_ids and plot_vis are not imported in the
# visible cells - confirm they are in scope.

# Gene symbols plotted per cell_type for each Dataset
symbols = [
    'CD34',
    'IGLL1',
    'TRGC2',
    'CCR9',
    'CCR7',
    'HIVEP3',
    'TOX2',
    'RAG1',
    'RAG2',
    'PCNA',
    'CDK1'
]

for d in collection['datasets']:
    # Only Visium Datasets with exactly 4992 observations are reviewed here
    if not ('Visium' in d['assay'][0]['label'] and d['cell_count'] == 4992):
        continue

    # Reset per Dataset so a missing asset cannot reuse the previous file
    file = None
    for a in d['assets']:
        if a['filetype'] == 'H5AD':
            # Download into my_dir rather than a user-specific absolute path
            file = os.path.join(my_dir, a['url'].split('/')[-1])
            if not os.path.exists(file):
                # Write to a temporary name, then rename, so an interrupted
                # download is not treated as complete on the next run.
                partial_path = file + '.part'
                with requests.get(a['url'], stream=True) as res:
                    res.raise_for_status()
                    with open(partial_path, "wb") as out_file:
                        for chunk in res.iter_content(chunk_size=1024 * 1024):
                            out_file.write(chunk)
                os.replace(partial_path, file)

    if file is None:
        report(f'No H5AD asset found for Dataset {d["dataset_id"]}', 'WARNING')
        continue

    adata = sc.read_h5ad(file)
    print(file)
    print(adata.uns['title'])

    evaluate_sparsity(adata)
    evaluate_data(adata)

    n_genes = len(adata.var)
    if n_genes < 15000:
        report('Less than 15k genes present','ERROR')
    elif n_genes < 20000:
        report('Less than 20k genes present','WARNING')

    evaluate_dup_counts(adata)
    ensg_list = symbols_to_ids(symbols, adata.var)
    sc.pl.dotplot(adata, ensg_list, 'cell_type', use_raw=False)
    if adata.raw is not None:
        sc.pl.dotplot(adata, ensg_list, 'cell_type', use_raw=True)
    plot_vis(adata, 'cell_type')
    print('-----------------')