### Donor-level stats in cellxgene data portal

Following [sfaira tutorial](https://github.com/theislab/sfaira_tutorials/blob/master/tutorials/cellxgene_download.ipynb)

In [1]:
import sfaira
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import os
import glob


In [2]:
data_path = os.path.join("/nfs/team205/ed6/data/", "sfaira_data")
cache_path = os.path.join("/nfs/team205/ed6/data/", "sfaira_cache")
meta_path = os.path.join("/nfs/team205/ed6/data/", "sfaira_meta")
if not os.path.exists(data_path):
    os.mkdir(data_path)
if not os.path.exists(cache_path):
    os.mkdir(cache_path)
if not os.path.exists(meta_path):
    os.mkdir(meta_path)

In [3]:
sfaira.settings.cachedir_base = cache_path

## Interact w database

In [4]:
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(
    data_path=data_path, cache_path=cache_path, meta_path=meta_path, cache_metadata=True)

Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.


In [5]:
## Subset to human collections
dsg.subset(key='organism', values=['Homo sapiens']) 

## Subset to scRNA-seq protocols (exclude spatial)
protocols = ["10x 3' transcription profiling", "10x 3' v1", "10x 3' v2",
       "10x 3' v3", "10x 5' transcription profiling", "10x 5' v1",
        'CEL-seq2', 'Drop-seq',
        'Smart-seq2', 
       'microwell-seq', 'sci-RNA-seq', 'single-cell RNA sequencing']
dsg.subset(key='assay_sc', values=protocols)

## Keep collections from a known primary tissue
target_collections = []
for k,v in dsg.datasets.items():
    if v.organ is not None:
        target_collections.append(k)
# dsg.subset(key="collection_id", values=target_collections)

In [6]:
## Keep collections from published data
DOIs = []
for k in target_collections:
    DOIs.append(dsg.datasets[k].doi)

In [7]:
target_collections = [target_collections[i] for i,d in enumerate(DOIs) if d[0].startswith('https://doi.org/')]

In [8]:
dsg.subset(key="id", values=target_collections)

In [9]:
len(dsg.datasets)

213

In [17]:
# dsg.download()

In [47]:
k = '0fdb6122-4600-40f0-a703-2da47cc7080d'
v = dsg.datasets[k]

In [48]:
v.load()

ValueError: adata of 0fdb6122-4600-40f0-a703-2da47cc7080d already loaded.

In [49]:
v.streamline_features(
        remove_gene_version=True,
        match_to_release={"Homo sapiens": "104"},#, "Mus musculus": "104"},
        subset_genes_to_type="protein_coding"
    )
v.streamline_metadata(keep_orginal_obs=True, keep_id_obs=True )

KeyError: 'ensembl'

In [None]:
v.adata.obs

In [33]:
obs_to_keep = ['disease', 'sex', 'tissue', 'assay', 'ethnicity', 'development_stage']
donor_obs = v.adata.obs[obs_to_keep + [o + '_ontology_term_id' for o in obs_to_keep] + ["Patient"]].drop_duplicates()

In [34]:
donor_obs

Unnamed: 0,disease,sex,tissue,assay,ethnicity,development_stage,disease_ontology_term_id,sex_ontology_term_id,tissue_ontology_term_id,assay_ontology_term_id,ethnicity_ontology_term_id,development_stage_ontology_term_id,Patient
0,dilated cardiomyopathy,female,heart right ventricle,10x 3' v3,unknown,fifth decade human stage,MONDO:0005021,PATO:0000383,UBERON:0002080,EFO:0009922,unknown,HsapDv:0000239,DP2
166,normal,male,interventricular septum,10x 3' v3,unknown,sixth decade human stage,PATO:0000461,PATO:0000384,UBERON:0002094,EFO:0009922,unknown,HsapDv:0000240,ED_H25
187,dilated cardiomyopathy,male,heart left ventricle,10x 3' v3,unknown,seventh decade human stage,MONDO:0005021,PATO:0000384,UBERON:0002084,EFO:0009922,unknown,HsapDv:0000241,DT4
302,normal,female,heart right ventricle,10x 3' v3,unknown,sixth decade human stage,PATO:0000461,PATO:0000383,UBERON:0002080,EFO:0009922,unknown,HsapDv:0000240,ED_H15
317,dilated cardiomyopathy,female,interventricular septum,10x 3' v3,unknown,fifth decade human stage,MONDO:0005021,PATO:0000383,UBERON:0002094,EFO:0009922,unknown,HsapDv:0000239,DP2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16690,dilated cardiomyopathy,male,heart left ventricle,10x 3' v3,unknown,fifth decade human stage,MONDO:0005021,PATO:0000384,UBERON:0002084,EFO:0009922,unknown,HsapDv:0000239,IC_H01
16715,dilated cardiomyopathy,male,heart left ventricle,10x 3' v3,unknown,adolescent stage,MONDO:0005021,PATO:0000384,UBERON:0002084,EFO:0009922,unknown,HsapDv:0000086,IC_H02
16824,dilated cardiomyopathy,male,heart left ventricle,10x 3' v3,unknown,fourth decade human stage,MONDO:0005021,PATO:0000384,UBERON:0002084,EFO:0009922,unknown,HsapDv:0000238,IC_H03
16861,dilated cardiomyopathy,female,heart left ventricle,10x 3' v3,unknown,seventh decade human stage,MONDO:0005021,PATO:0000383,UBERON:0002084,EFO:0009922,unknown,HsapDv:0000241,IC_H04


In [22]:
os.listdir(v.data_dir)

['0fdb6122-4600-40f0-a703-2da47cc7080d.h5ad',
 '83b5e943-a1d5-4164-b3f2-f7a37f01b524.h5ad',
 'bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7.h5ad',
 'f7995301-7551-4e1d-8396-ffe3c9497ace.h5ad',
 'ed2b673b-0279-454a-998c-3eec361edf54.h5ad',
 '1252c5fb-945f-42d6-b1a8-8a3bd864384b.h5ad',
 '1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d.h5ad',
 '9434b020-de42-43eb-bcc4-542b2be69015.h5ad',
 '65badd7a-9262-4fd1-9ce2-eb5dc0ca8039.h5ad']

In [None]:
cellxgene_donor_obs = pd.DataFrame()
for k,v in dsg.datasets.items():
    ## Restrict to datasets with disease conditions
    if v.disease is None or v.disease != 'healthy':
        print(f"Loading {k}...")
        if v.adata is None:
            v.load()
        obs_to_keep = ['disease', 'sex', 'tissue', 'assay', 'ethnicity', 'development_stage']
        donor_obs = v.adata.obs[obs_to_keep + [o + '_ontology_term_id' for o in obs_to_keep]].drop_duplicates()
        donor_obs['dataset'] = k
        cellxgene_donor_obs = pd.concat([cellxgene_donor_obs, donor_obs])
        v.clear()

Loading edc8d3fe-153c-4e3d-8be0-2108d30f8d70...
Loading 2a498ace-872a-4935-984b-1afa70fd9886...
Loading d5c67a4e-a8d9-456d-a273-fa01adb1b308...
Loading c2a461b1-0c15-4047-9fcb-1f966fe55100...
Loading 1252c5fb-945f-42d6-b1a8-8a3bd864384b...
Loading bdf69f8d-5a96-4d6f-a9f5-9ee0e33597b7...
Loading 83b5e943-a1d5-4164-b3f2-f7a37f01b524...
Loading 9434b020-de42-43eb-bcc4-542b2be69015...
Loading ed2b673b-0279-454a-998c-3eec361edf54...
Loading f7995301-7551-4e1d-8396-ffe3c9497ace...
Loading 65badd7a-9262-4fd1-9ce2-eb5dc0ca8039...
Loading 0fdb6122-4600-40f0-a703-2da47cc7080d...
Loading 1062c0f2-2a44-4cf9-a7c8-b5ed58b4728d...
Loading 8e47ed12-c658-4252-b126-381df8d52a3d...
Loading 774c18c5-efa1-4dc5-9e5e-2c824bab2e34...
Loading 1c739a3e-c3f5-49d5-98e0-73975e751201...
Loading f15e263b-6544-46cb-a46e-e33ab7ce8347...
Loading 06b91002-4d3d-4d2e-8484-20c3b31e232c...
Loading 6c600df6-ddca-4628-a8bb-1d6de1e3f9b4...
Loading 75a881cf-5d88-46e2-bf9b-97e5cbc1bd56...
Loading 873ff933-4fda-4936-9a70-67df11af