# Curated Atlas Query (Python)

## Importing the package

In [1]:
from curated_atlas_query_py import get_metadata, get_anndata

## Getting the metadata
The `get_metadata()` function returns a database connection and a DuckDB table.
The table can be used to query the metadata, while the connection's main purpose is to be closed when you are finished.

In [None]:
conn, table = curated_atlas_query_py.get_metadata()
table

: 

### Querying the metadata
The DuckDB table can be queried using a number of methods [described here](https://duckdb.org/docs/api/python/reference/#duckdb.DuckDBPyRelation). In particular:
* [`.filter()`](https://duckdb.org/docs/api/python/reference/#duckdb.DuckDBPyRelation.filter): filters the metadata using a string expression
* [`.fetchdf()`](https://duckdb.org/docs/api/python/reference/#duckdb.DuckDBPyRelation.fetchdf): Executes the query and returns it as a pandas DataFrame

In [8]:
table.project("ethnicity").unique().fetchdf()

ethnicity
str
"""European"""
"""Asian"""
"""African Americ..."
"""unknown"""
"""admixed ancest..."
"""Greater Middle..."
"""Hispanic or La..."
"""East Asian"""
"""African Americ..."
"""na"""


In [10]:
df.filter(
    pl.col("ethnicity") == "African"
).head().collect()

.cell,sample_id_db,.sample,.sample_name,assay,assay_ontology_term_id,file_id_db,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,ethnicity,ethnicity_ontology_term_id,file_id,is_primary_data.x,organism,organism_ontology_term_id,sample_placeholder,sex,sex_ontology_term_id,tissue,tissue_ontology_term_id,tissue_harmonised,age_days,dataset_id,collection_id,cell_count,dataset_deployments,is_primary_data.y,is_valid,linked_genesets,mean_genes_per_cell,name,published,revision,schema_version,tombstone,x_normalization,created_at.x,published_at,revised_at,updated_at.x,filename,filetype,s3_uri,user_submitted,created_at.y,updated_at.y,cell_type_harmonised,confidence_class,cell_annotation_azimuth_l2,cell_annotation_blueprint_singler,n_cell_type_in_tissue,n_tissue_in_cell_type
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,i64,str,str,i64,i64,f64,str,i64,i64,str,i64,str,f64,f64,f64,f64,str,str,str,i64,f64,f64,str,f64,str,str,i64,i64
"""AGGGAGTAGCGTTT...","""9da02eab40e49d...","""20071ec5a12650...","""Donor_03___lun...","""10x 3' v2""","""EFO:0009899""","""e593acfb8846d0...","""alveolar macro...","""CL:0000583""","""29-year-old hu...","""HsapDv:0000123...","""normal""","""PATO:0000461""","""African""","""HANCESTRO:0010...","""6661ab3a-792a-...","""TRUE""","""Homo sapiens""","""NCBITaxon:9606...",,"""female""","""PATO:0000383""","""lung parenchym...","""UBERON:0008946...","""lung""",10585.0,"""066943a2-fdac-...","""6f6d381a-7701-...",584884,"""https://cellxg...","""BOTH""",0,,1949.574488,"""The integrated...",1,2,"""2.0.0""",0,"""custom""",19062.0,19062.0,19227.0,19227.0,"""local.h5ad""","""H5AD""","""s3://corpora-d...",1,19226.0,19227.0,"""macrophage""",1.0,"""doublet""","""macrophages""",28,32
"""ATTGGACAGCCGAT...","""89ec472baa9d51...","""4fc10a6b85e5fa...","""VUHD92___lung ...","""10x 5' v1""","""EFO:0011025""","""8db91668781df0...","""pulmonary arte...","""CL:1001568""","""55-year-old hu...","""HsapDv:0000149...","""normal""","""PATO:0000461""","""African""","""HANCESTRO:0010...","""6661ab3a-792a-...","""TRUE""","""Homo sapiens""","""NCBITaxon:9606...",,"""male""","""PATO:0000384""","""lung parenchym...","""UBERON:0008946...","""lung""",20075.0,"""066943a2-fdac-...","""6f6d381a-7701-...",584884,"""https://cellxg...","""BOTH""",0,,1949.574488,"""The integrated...",1,2,"""2.0.0""",0,"""custom""",19062.0,19062.0,19227.0,19227.0,"""local.h5ad""","""H5AD""","""s3://corpora-d...",1,19226.0,19227.0,"""non_immune""",,,,28,32
"""CCCAGTTCATACCA...","""26750b2a06c447...","""055e5172053464...","""Donor_06___lun...","""10x 3' v2""","""EFO:0009899""","""e593acfb8846d0...","""alveolar macro...","""CL:0000583""","""22-year-old hu...","""HsapDv:0000116...","""normal""","""PATO:0000461""","""African""","""HANCESTRO:0010...","""6661ab3a-792a-...","""TRUE""","""Homo sapiens""","""NCBITaxon:9606...",,"""female""","""PATO:0000383""","""lung parenchym...","""UBERON:0008946...","""lung""",8030.0,"""066943a2-fdac-...","""6f6d381a-7701-...",584884,"""https://cellxg...","""BOTH""",0,,1949.574488,"""The integrated...",1,2,"""2.0.0""",0,"""custom""",19062.0,19062.0,19227.0,19227.0,"""local.h5ad""","""H5AD""","""s3://corpora-d...",1,19226.0,19227.0,"""macrophage""",1.0,"""platelet""","""macrophages""",28,32
"""TGGACGCAGTGATC...","""26750b2a06c447...","""055e5172053464...","""Donor_06___lun...","""10x 3' v2""","""EFO:0009899""","""e593acfb8846d0...","""alveolar macro...","""CL:0000583""","""22-year-old hu...","""HsapDv:0000116...","""normal""","""PATO:0000461""","""African""","""HANCESTRO:0010...","""6661ab3a-792a-...","""TRUE""","""Homo sapiens""","""NCBITaxon:9606...",,"""female""","""PATO:0000383""","""lung parenchym...","""UBERON:0008946...","""lung""",8030.0,"""066943a2-fdac-...","""6f6d381a-7701-...",584884,"""https://cellxg...","""BOTH""",0,,1949.574488,"""The integrated...",1,2,"""2.0.0""",0,"""custom""",19062.0,19062.0,19227.0,19227.0,"""local.h5ad""","""H5AD""","""s3://corpora-d...",1,19226.0,19227.0,"""macrophage""",1.0,"""doublet""","""macrophages""",28,32
"""ACGGTTACAGTCTT...","""c87e74c1cd0b6e...","""13f5331436ecae...","""NU_CZI01___lun...","""10x 3' v3""","""EFO:0009922""","""e593acfb8846d0...","""alveolar macro...","""CL:0000583""","""52-year-old hu...","""HsapDv:0000146...","""normal""","""PATO:0000461""","""African""","""HANCESTRO:0010...","""6661ab3a-792a-...","""TRUE""","""Homo sapiens""","""NCBITaxon:9606...",,"""male""","""PATO:0000384""","""lung parenchym...","""UBERON:0008946...","""lung""",18980.0,"""066943a2-fdac-...","""6f6d381a-7701-...",584884,"""https://cellxg...","""BOTH""",0,,1949.574488,"""The integrated...",1,2,"""2.0.0""",0,"""custom""",19062.0,19062.0,19227.0,19227.0,"""local.h5ad""","""H5AD""","""s3://corpora-d...",1,19226.0,19227.0,"""macrophage""",1.0,"""cd14 mono""","""macrophages""",28,32


In [4]:
q = sqlalchemy.select('*').where( \
                                 mdtab.c.ethnicity == "African", \
                                 mdtab.c.assay.like('%10x%'), \
                                 mdtab.c.tissue == "lung parenchyma", \
                                 mdtab.c.cell_type.like('%CD4%') \
                                )

In [5]:
with eng.connect() as conn:
    mddf = pd.DataFrame(conn.execute(q))
    
eng.dispose()
mddf

Unnamed: 0,.cell,sample_id_db,.sample,.sample_name,assay,assay_ontology_term_id,file_id_db,cell_type,cell_type_ontology_term_id,development_stage,...,s3_uri,user_submitted,created_at.y,updated_at.y,cell_type_harmonised,confidence_class,cell_annotation_azimuth_l2,cell_annotation_blueprint_singler,n_cell_type_in_tissue,n_tissue_in_cell_type
0,ACAGCCGGTCCGTTAA_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tem,1.0,mait,cd4 tem,28.0,31.0
1,GGGAATGAGCCCAGCT_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
2,TCTTCGGAGTAGCGGT_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
3,CCTTACGAGAGCTGCA_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
4,ATCTACTCAATGGAAT_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1566,TACTTACGTAATAGCA_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
1567,AGATAGAGTGCCTTCT_SC84,21ef23ac07391c64cadc78e16511effa,13f5331436ecaeaeffada423c8dbd1ef,NU_CZI01___lung parenchyma___52-year-old human...,10x 3' v3,EFO:0009922,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,52-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
1568,CGCGGTATCCGCGCAA_SC24,9dfbd16390b119392af9406561cb664f,055e5172053464e8efc5de1b5b3a7646,Donor_06___lung parenchyma___22-year-old human...,10x 3' v2,EFO:0009899,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,22-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
1569,TACAACGTCAGCATTG_SC84,21ef23ac07391c64cadc78e16511effa,13f5331436ecaeaeffada423c8dbd1ef,NU_CZI01___lung parenchyma___52-year-old human...,10x 3' v3,EFO:0009922,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,52-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,3.0,cd4 tcm,tregs,28.0,32.0


### Exploring the HCA contents

In [6]:
eng, mdtab = hcaquery.get_metadata()

from sqlalchemy import func

q = sqlalchemy.select(mdtab.c.tissue, mdtab.c.file_id, func.count()).distinct().group_by(mdtab.c.tissue)
with eng.connect() as conn:
    mddf = pd.DataFrame(conn.execute(q))
    
eng.dispose()
mddf

Unnamed: 0,tissue,file_id,count_1
0,adipose tissue,343f46f2-7cdd-4da8-bc7f-50a18b2c0e8e,22114
1,adrenal gland,16217568-ec4e-4391-891d-1e14c64da474,547539
2,ampulla of uterine tube,3044b5dd-a499-456e-86d9-94769bc3b63e,43247
3,anterior cingulate cortex,a91f075b-52d5-4aa3-8ecc-86c4763a49b3,7417
4,anterior part of tongue,343f46f2-7cdd-4da8-bc7f-50a18b2c0e8e,10734
...,...,...,...
160,vasculature,343f46f2-7cdd-4da8-bc7f-50a18b2c0e8e,5572
161,vault of skull,e40591e7-0e5a-4bef-9b60-7015abe5b17f,5129
162,venous blood,a84321f2-5b06-4274-8f96-e1876340600e,17625
163,vermiform appendix,e40591e7-0e5a-4bef-9b60-7015abe5b17f,4486


### Querying using raw SQL
For those who prefer writing raw SQL over SQLalchemy, you can use pandas `read_sql_query()` instead of SQLAlchemy.

In [7]:
eng, mdtab = hcaquery.get_metadata()

with eng.connect() as conn:
    query = sqlalchemy.text("SELECT * FROM metadata \
                            WHERE ethnicity='African' \
                                AND assay LIKE '%10x%' \
                                AND tissue='lung parenchyma' \
                                AND cell_type LIKE '%CD4%'")
    mddf = pd.read_sql_query(query, conn)
    
eng.dispose()
mddf

Unnamed: 0,.cell,sample_id_db,.sample,.sample_name,assay,assay_ontology_term_id,file_id_db,cell_type,cell_type_ontology_term_id,development_stage,...,s3_uri,user_submitted,created_at.y,updated_at.y,cell_type_harmonised,confidence_class,cell_annotation_azimuth_l2,cell_annotation_blueprint_singler,n_cell_type_in_tissue,n_tissue_in_cell_type
0,ACAGCCGGTCCGTTAA_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tem,1.0,mait,cd4 tem,28.0,31.0
1,GGGAATGAGCCCAGCT_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
2,TCTTCGGAGTAGCGGT_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
3,CCTTACGAGAGCTGCA_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
4,ATCTACTCAATGGAAT_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1566,TACTTACGTAATAGCA_F02526,33cdeb84ae1462d723c19af1bea2a366,4fc10a6b85e5fa688b253db4e0db8ba0,VUHD92___lung parenchyma___55-year-old human s...,10x 5' v1,EFO:0011025,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,55-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
1567,AGATAGAGTGCCTTCT_SC84,21ef23ac07391c64cadc78e16511effa,13f5331436ecaeaeffada423c8dbd1ef,NU_CZI01___lung parenchyma___52-year-old human...,10x 3' v3,EFO:0009922,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,52-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
1568,CGCGGTATCCGCGCAA_SC24,9dfbd16390b119392af9406561cb664f,055e5172053464e8efc5de1b5b3a7646,Donor_06___lung parenchyma___22-year-old human...,10x 3' v2,EFO:0009899,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,22-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,4.0,cd4 tcm,cd4 tem,28.0,32.0
1569,TACAACGTCAGCATTG_SC84,21ef23ac07391c64cadc78e16511effa,13f5331436ecaeaeffada423c8dbd1ef,NU_CZI01___lung parenchyma___52-year-old human...,10x 3' v3,EFO:0009922,bc380dae8b14313a870973697842878b,"CD4-positive, alpha-beta T cell",CL:0000624,52-year-old human stage,...,s3://corpora-data-prod/13825e35-ea32-4104-a0b7...,1,19226.0,19227.0,cd4 tcm,3.0,cd4 tcm,tregs,28.0,32.0


## Extracting Counts

Query raw counts

In [8]:
res = hcaquery.get_SingleCellExperiment(mddf, assays = ['counts'], repository='file:///vast/projects/human_cell_atlas_py/anndata')
res

Downloading file:///vast/projects/human_cell_atlas_py/anndata/original/bc380dae8b14313a870973697842878b.h5ad to /vast/scratch/users/yang.e/tmp/hca_harmonised/original/bc380dae8b14313a870973697842878b.h5ad
Reading sample files: 100%|█████████████████| 1/1 [00:00<00:00, 13315.25files/s]
Concatenating files...


AnnData object with n_obs × n_vars = 21285 × 60661

Query counts scaled per million. This is helpful if just few genes are of interest

In [9]:
res = hcaquery.get_SingleCellExperiment(mddf, assays = ['cpm'], repository='file:///vast/projects/human_cell_atlas_py/anndata')
res

Downloading file:///vast/projects/human_cell_atlas_py/anndata/cpm/bc380dae8b14313a870973697842878b.h5ad to /vast/scratch/users/yang.e/tmp/hca_harmonised/cpm/bc380dae8b14313a870973697842878b.h5ad
Reading sample files: 100%|█████████████████| 1/1 [00:00<00:00, 11155.06files/s]
Concatenating files...


AnnData object with n_obs × n_vars = 21285 × 60661

In [10]:
res=hcaquery.get_SingleCellExperiment(mddf, features = ['PUM1'], repository='file:///vast/projects/human_cell_atlas_py/anndata')
res

Reading sample files: 100%|██████████████████| 2/2 [00:00<00:00, 9927.35files/s]
Concatenating files...


AnnData object with n_obs × n_vars = 42570 × 1

In [11]:
res.obs

P2_2_TGTTCCGAGGCCCGTT-0
CTAATGGAGTGGGATC_HD67-0
GCTCCTAAGTGGACGT-1-HCATisStab7659969-0
CATGTACAGTGGGAT_GRO-09_biopsy-0
ACAGCCGGTCCGTTAA_F02526-0
...
CGGACACAGTGGAGTC_GRO-03_biopsy-1
D344_Brus_Dis1_GATGCTAAGTACGCCC-1-14-1
CTGATAGCAAATACAG-SC45-1
P2_3_TAAGTGCGTCCAACTA-1
AGCGGCACCCGATA-SC31-1
