In [2]:
import session_info

import itertools
import os
import re
import shutil
import sys
from pathlib import Path
from typing import Iterable, Literal

import anndata as ad
import numpy as np
import scipy.sparse as sp
import duckdb
import pandas as pd
import requests
from appdirs import user_cache_dir
from tqdm import tqdm

In [3]:
# Base URL of the object store that hosts the cellNexus files.
REMOTE_URL = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5"
# Root of the assay (AnnData) files and full URL of the versioned metadata table.
ASSAY_URL = f"{REMOTE_URL}/cellNexus-anndata"
METADATA_URL = f"{REMOTE_URL}/cellNexus-metadata/metadata.1.0.12.parquet"
# Size in bytes; sync_assay_files asks for a download when a cached assay file
# is smaller than this.
MIN_EXPECTED_SIZE = 5_000_000

# Assay names known to this notebook (currently an identity mapping).
# NOTE(review): presumably user-facing name -> remote directory name; confirm.
assay_map = dict(counts="counts", cpm="cpm")

def is_parquet_valid(parquet_file):
    """Return whether DuckDB can open ``parquet_file`` as a Parquet file.

    Args:
        parquet_file: Location of the file to probe; it is converted with
            ``str()`` before being handed to DuckDB.

    Returns:
        ``True`` when DuckDB opens the file without raising, ``False``
        otherwise. The error is printed and never propagated.
    """
    try:
        # The connection exists only for this probe; the ``with`` block closes
        # it on success and on failure so repeated checks do not leak handles.
        with duckdb.connect() as conn:
            conn.from_parquet(str(parquet_file))  # Raises if the file cannot be opened
        return True  # File is valid
    except Exception as e:  # Deliberately broad: any failure means "not usable"
        print(f"Parquet file is corrupt: {e}")
        return False  # File is corrupt
        
def _get_default_cache_dir() -> Path:
    """Return the user cache directory that ``appdirs`` reports for ``cellNexusPy``."""
    cache_root = user_cache_dir("cellNexusPy")
    return Path(cache_root)

# Helper function to download a file over HTTP/HTTPS.
def _sync_remote_file(full_url: str, output_file: Path):
    """Download ``full_url`` to ``output_file`` unless that file already exists.

    The payload is streamed into a sibling ``*.part`` file and moved into place
    only once the transfer has finished, so an interrupted download never
    leaves a truncated file at ``output_file``.

    Args:
        full_url: HTTP(S) URL to fetch.
        output_file: Destination path; missing parent directories are created.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if output_file.exists():
        return

    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {full_url} to {output_file}", file=sys.stderr)

    partial_file = output_file.with_name(output_file.name + ".part")
    try:
        # ``timeout`` bounds the connect time and each wait for data (not the
        # whole transfer); without it a stalled server blocks forever.
        with requests.get(full_url, stream=True, allow_redirects=True, timeout=60) as req:
            req.raise_for_status()
            # A missing Content-Length becomes ``None`` so tqdm shows a plain counter.
            total = int(req.headers.get("Content-Length", 0)) or None
            # ``wrapattr`` is a classmethod that creates and closes its own
            # bar, so the total has to be passed to it directly.
            with tqdm.wrapattr(req.raw, "read", total=total) as src, partial_file.open("wb") as dest:
                shutil.copyfileobj(src, dest)
        partial_file.replace(output_file)
    finally:
        # After a successful move this is a no-op; after a failure it removes
        # the incomplete download.
        partial_file.unlink(missing_ok=True)

# function to get metadata
def get_metadata(
    parquet_url: str = METADATA_URL,
    cache_dir: os.PathLike[str] = _get_default_cache_dir(),
) -> tuple[duckdb.DuckDBPyConnection, duckdb.DuckDBPyRelation]:
    """Open the metadata Parquet file, downloading it into the cache if needed.

    The local copy is named after the last path segment of ``parquet_url``. It
    is (re-)downloaded when it is missing or when ``is_parquet_valid`` rejects it.

    Args:
        parquet_url: URL of the metadata Parquet file.
        cache_dir: Directory holding the local copy. The default is computed
            once, when this function is defined.

    Returns:
        A ``(connection, relation)`` pair: a new DuckDB connection and the
        relation it created from the local Parquet file.
    """
    parquet_local = Path(cache_dir) / parquet_url.split("/")[-1]

    if not parquet_local.exists() or not is_parquet_valid(parquet_local):
        print("File is missing or corrupted. Re-downloading...")
        parquet_local.unlink(missing_ok=True)  # Delete the corrupted file
        _sync_remote_file(parquet_url, parquet_local)  # Re-download

    conn = duckdb.connect()
    try:
        return conn, conn.from_parquet(str(parquet_local))
    except Exception:
        # Do not leak the connection when the file cannot be opened.
        conn.close()
        raise

def sync_assay_files(
    url: str = ASSAY_URL,
    cache_dir: Path = _get_default_cache_dir(),
    subdir: str = "",
    atlas: str = "",
    aggregation: str = "",
    files: Iterable[str] = [],
):
    """Ensure each requested assay file is cached locally, yielding its path.

    This is a generator: a file is checked (and downloaded if necessary) only
    when the corresponding item is consumed.

    Args:
        url: Base URL of the assay files.
        cache_dir: Root of the local cache (``str`` or ``Path``).
        subdir: Path component below ``aggregation`` (remote and local).
        atlas: Path component directly below ``url`` / ``cache_dir``.
        aggregation: Path component below ``atlas``.
        files: File names to synchronise.

    Yields:
        ``(subdir, local_path)`` for every entry of ``files``, in order.
    """
    cache_root = Path(cache_dir)
    for file in files:
        sub_url = f"{url}/{atlas}/{aggregation}/{subdir}/{file}"
        output_filepath = cache_root / atlas / aggregation / subdir / file

        needs_download = not output_filepath.exists()
        if not needs_download and os.path.getsize(output_filepath) < MIN_EXPECTED_SIZE:
            # _sync_remote_file skips paths that already exist, so an
            # undersized file has to be removed before it can be fetched again.
            output_filepath.unlink()
            needs_download = True

        if needs_download:
            _sync_remote_file(sub_url, output_filepath)

        yield subdir, output_filepath

# METACELL

In [4]:
# Open the metadata table (downloaded into the cache when missing or unreadable)
# and display the resulting DuckDB relation.
conn, table = get_metadata()
table

┌─────────────────────────────────────────────────────────────┬──────────────────────────────────────┬────────────────────┬─────────────────────────────────────────┬────────────────────────┬────────────────────────────┬──────────────────────────────────┬───────────┬────────────────────────┬────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────┬──────────────────────────────────────┬───────────────────┬─────────────────────────┬────────────────────────────────────┬─────────┬──────────────────────────┬──────────┬───────────────┬──────────────────────────────────────────────────────────────────────────────┬───────────────┬───────────────┬──────────┬─────────────────┬─────────────────────┬────

In [5]:
# Restrict the metadata to the rows of interest; the string is a SQL predicate
# handed to the relation's filter().
query = table.filter("""
    self_reported_ethnicity = 'African'
    AND assay LIKE '%10%'
    AND tissue = 'lung parenchyma'
    AND cell_type LIKE '%CD4%'
""")
query

┌────────────────────────────────────────────────────────────────┬──────────────────────────────────────┬────────────────────┬──────────────────────────────────────────┬─────────────────────────────────┬────────────────────────────┬──────────────────────────────────┬───────────┬────────────────────────┬────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────┬──────────────────────────────────────┬───────────────────┬─────────────────────────┬────────────────────────────────────┬─────────┬──────────────────────────┬───────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────────────────────────────────────────────────────────────────────────┬─────

In [53]:
# Parameters for the metacell retrieval cells that follow.
data= query  # metadata relation selecting what to load
assay = "counts"  # passed to sync_assay_files as `subdir`
aggregation = "metacell_2"  # metadata column that must be non-null; also a path component
cache_directory = _get_default_cache_dir()
repository = ASSAY_URL
features = slice(None, None, None)  # NOTE(review): not used in the cells visible here

In [221]:
# Validate the parameters before touching the cache or the network.
assert assay in assay_map, f"assay must be one of {sorted(assay_map)}"
assert isinstance(cache_directory, Path), "cache_directory must be a Path"

cache_directory.mkdir(exist_ok=True, parents=True)

# Keep only rows that have a value in the chosen aggregation column. Re-running
# this cell applies the same predicate again, which selects the same rows.
data = data.filter(f"{aggregation} IS NOT NULL")

files_to_read = (
    data.project("file_id_cellNexus_single_cell").distinct().fetchdf()["file_id_cellNexus_single_cell"]
)

# Only one atlas can be addressed per sync call: fail clearly when the query
# selected nothing, and say so when more than one atlas was selected.
atlas_ids = data.project('"atlas_id"').distinct().fetchdf()["atlas_id"]
assert not atlas_ids.empty, "the query selects no rows with a non-null aggregation value"
if len(atlas_ids) > 1:
    print(
        f"Warning: the query spans {len(atlas_ids)} atlases; only {atlas_ids.iloc[0]!r} is used",
        file=sys.stderr,
    )
atlas = atlas_ids.iloc[0]

# sync_assay_files is a generator; materialise it so the files are fetched here
# and `synced` can be iterated more than once by later cells.
synced = list(
    sync_assay_files(
        url=repository,
        cache_dir=cache_directory,
        atlas=atlas,
        subdir=assay,
        aggregation=aggregation,
        files=files_to_read,
    )
)

In [222]:
# NOTE: `filter_data` is defined in a cell further down this notebook; that cell
# has to be run before this one (a fresh "Run All" fails here otherwise).
# Collect the AnnData of every synced file. The list is built across all groups
# so an earlier group is not overwritten by a later one, and it is defined
# (empty) even when nothing was synced.
ads = []
for _, group in itertools.groupby(synced, key=lambda item: item[0]):
    ads.extend(filter_data(path) for _, path in group)

In [223]:
# Stack the per-file AnnData objects along the observation axis. With
# index_unique="_" a "_<n>" suffix is appended to every observation name, as
# the "_0", "_1", ... endings of the obs index displayed in this notebook show.
adatas = ad.concat(ads,index_unique="_")

In [224]:
# Display the summary of the concatenated AnnData object.
adatas

AnnData object with n_obs × n_vars = 703 × 56239
    obs: 'dataset_id', 'sample_id', 'assay', 'assay_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'experiment___', 'explorer_url', 'feature_count', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'published_at', 'raw_data_location', 'revised_at', 'sample_heuristic', 'schema_version', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'title', 'tombstone', 'url', 'age_days', 'tissue_groups', 'atlas_id', 'sample_chunk', 'file_id_cellNexus_single_cell', 'file_id_cellNexus_metacell'

In [220]:
def filter_data(file):
    """Load one cached ``.h5ad`` file and keep the metacells selected in ``data``.

    Reads the notebook-level DuckDB relation ``data``. Its ``metacell_2``
    column must have no missing values for the selected rows, because the
    values are converted to ``int`` to build the metacell identifiers.

    Args:
        file: Path of the ``.h5ad`` file. Its base name is matched against the
            ``file_id_cellNexus_single_cell`` metadata column.

    Returns:
        An ``AnnData`` restricted to the observations named
        ``<sample_id>___<metacell_2>`` that occur in the metadata, with ``obs``
        replaced by the de-duplicated metadata columns listed below.
    """
    obs_columns = [
        "dataset_id", "sample_id", "assay", "assay_ontology_term_id",
        "development_stage", "development_stage_ontology_term_id", "disease", "disease_ontology_term_id",
        "donor_id", "experiment___", "explorer_url", "feature_count", "is_primary_data",
        "organism", "organism_ontology_term_id", "published_at", "raw_data_location",
        "revised_at", "sample_heuristic", "schema_version", "self_reported_ethnicity",
        "self_reported_ethnicity_ontology_term_id", "sex", "sex_ontology_term_id", "tissue",
        "tissue_ontology_term_id", "tissue_type", "title", "tombstone", "url", "age_days",
        "tissue_groups", "atlas_id", "sample_chunk", "file_id_cellNexus_single_cell",
        "file_id_cellNexus_metacell",
    ]

    # Path.name instead of splitting on "/", so paths with other separators
    # (e.g. on Windows) still reduce to the bare file name.
    file_name = Path(file).name
    # Double any single quote so the name cannot terminate the SQL literal.
    sql_name = file_name.replace("'", "''")
    df = data.filter(f"file_id_cellNexus_single_cell = '{sql_name}'").fetchdf()

    df["file_id_cellNexus_metacell"] = df["sample_id"].astype(str) + "___" + df["metacell_2"].astype(int).astype(str)
    # NOTE(review): the index shares its name with a column; anndata may refuse
    # to write such a frame once the two diverge (e.g. after ad.concat with
    # index_unique) -- confirm before saving the result to disk.
    df.index = df["file_id_cellNexus_metacell"]

    # Copy the subset so that assigning ``obs`` does not modify a view.
    filt_ad = ad.read_h5ad(file)[df["file_id_cellNexus_metacell"].unique()].copy()

    obs = df[obs_columns].drop_duplicates()
    # Align the metadata rows to the AnnData rows by metacell id instead of
    # relying on both being in the same order. A plain list keeps the index name.
    filt_ad.obs = obs.loc[list(filt_ad.obs_names)]
    return filt_ad

In [225]:
# Display the per-observation metadata of the concatenated object.
adatas.obs

Unnamed: 0_level_0,dataset_id,sample_id,assay,assay_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,donor_id,experiment___,...,tissue_type,title,tombstone,url,age_days,tissue_groups,atlas_id,sample_chunk,file_id_cellNexus_single_cell,file_id_cellNexus_metacell
file_id_cellNexus_metacell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
270eb221dd0456cc063240404aec74cd___8_0,9f222629-9e39-47d0-b83f-e08d610c7479,270eb221dd0456cc063240404aec74cd,10x 3' v2,EFO:0009899,22-year-old human stage,HsapDv:0000116,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,8030,respiratory system,cellxgene/03-06-2025,1.0,8ff41aeab65f0de9a6954ea3bfb5707c___1.h5ad,270eb221dd0456cc063240404aec74cd___8
4f067f7e5f960bc72b0710684a521e84____SC84___895_1,9f222629-9e39-47d0-b83f-e08d610c7479,4f067f7e5f960bc72b0710684a521e84____SC84,10x 3' v3,EFO:0009922,52-year-old human stage,HsapDv:0000146,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,18980,respiratory system,cellxgene/03-06-2025,1.0,46cf7e97c5438f1b59b0a7e54be2ad22___1.h5ad,4f067f7e5f960bc72b0710684a521e84____SC84___895
4f067f7e5f960bc72b0710684a521e84____SC84___162_1,9f222629-9e39-47d0-b83f-e08d610c7479,4f067f7e5f960bc72b0710684a521e84____SC84,10x 3' v3,EFO:0009922,52-year-old human stage,HsapDv:0000146,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,18980,respiratory system,cellxgene/03-06-2025,1.0,46cf7e97c5438f1b59b0a7e54be2ad22___1.h5ad,4f067f7e5f960bc72b0710684a521e84____SC84___162
4f067f7e5f960bc72b0710684a521e84____SC84___942_1,9f222629-9e39-47d0-b83f-e08d610c7479,4f067f7e5f960bc72b0710684a521e84____SC84,10x 3' v3,EFO:0009922,52-year-old human stage,HsapDv:0000146,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,18980,respiratory system,cellxgene/03-06-2025,1.0,46cf7e97c5438f1b59b0a7e54be2ad22___1.h5ad,4f067f7e5f960bc72b0710684a521e84____SC84___942
4f067f7e5f960bc72b0710684a521e84____SC84___421_1,9f222629-9e39-47d0-b83f-e08d610c7479,4f067f7e5f960bc72b0710684a521e84____SC84,10x 3' v3,EFO:0009922,52-year-old human stage,HsapDv:0000146,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,18980,respiratory system,cellxgene/03-06-2025,1.0,46cf7e97c5438f1b59b0a7e54be2ad22___1.h5ad,4f067f7e5f960bc72b0710684a521e84____SC84___421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
d0a8856647d20b1fa1e83edb4bb9313e___20_20,9f222629-9e39-47d0-b83f-e08d610c7479,d0a8856647d20b1fa1e83edb4bb9313e,10x 5' v1,EFO:0011025,55-year-old human stage,HsapDv:0000149,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,20075,respiratory system,cellxgene/03-06-2025,1.0,814feb36368a5d0258a2d62a0f1d4a7a___1.h5ad,d0a8856647d20b1fa1e83edb4bb9313e___20
d0a8856647d20b1fa1e83edb4bb9313e___21_20,9f222629-9e39-47d0-b83f-e08d610c7479,d0a8856647d20b1fa1e83edb4bb9313e,10x 5' v1,EFO:0011025,55-year-old human stage,HsapDv:0000149,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,20075,respiratory system,cellxgene/03-06-2025,1.0,814feb36368a5d0258a2d62a0f1d4a7a___1.h5ad,d0a8856647d20b1fa1e83edb4bb9313e___21
d0a8856647d20b1fa1e83edb4bb9313e___32_20,9f222629-9e39-47d0-b83f-e08d610c7479,d0a8856647d20b1fa1e83edb4bb9313e,10x 5' v1,EFO:0011025,55-year-old human stage,HsapDv:0000149,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,20075,respiratory system,cellxgene/03-06-2025,1.0,814feb36368a5d0258a2d62a0f1d4a7a___1.h5ad,d0a8856647d20b1fa1e83edb4bb9313e___32
d0a8856647d20b1fa1e83edb4bb9313e___12_20,9f222629-9e39-47d0-b83f-e08d610c7479,d0a8856647d20b1fa1e83edb4bb9313e,10x 5' v1,EFO:0011025,55-year-old human stage,HsapDv:0000149,normal,PATO:0000461,homosapiens_None_2023_None_sikkemalisa_002_d10...,,...,tissue,An integrated cell atlas of the human lung in ...,False,https://datasets.cellxgene.cziscience.com/8d84...,20075,respiratory system,cellxgene/03-06-2025,1.0,814feb36368a5d0258a2d62a0f1d4a7a___1.h5ad,d0a8856647d20b1fa1e83edb4bb9313e___12
