In [1]:
import session_info

import itertools
import os
import re
import shutil
import sys
from pathlib import Path
from typing import Iterable, Literal

import anndata as ad
import numpy as np
import scipy.sparse as sp
import duckdb
import pandas as pd
import requests
from appdirs import user_cache_dir
from tqdm import tqdm

In [2]:
# Root of the object-store container that hosts the cellNexus files.
REMOTE_URL = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5"
# Base URL of the AnnData (.h5ad) assay files.
ASSAY_URL = f"{REMOTE_URL}/cellNexus-anndata"
# URL of the metadata parquet file (the version is part of the file name).
METADATA_URL = f"{REMOTE_URL}/cellNexus-metadata/metadata.1.0.12.parquet"
# Size threshold in bytes consulted by sync_assay_files for cached files.
MIN_EXPECTED_SIZE = 5_000_000

# Assay names known to this notebook; each one currently maps to itself.
assay_map = dict(counts="counts", cpm="cpm")

def is_parquet_valid(parquet_file):
    """Return True if DuckDB can open ``parquet_file`` as a parquet relation.

    Parameters
    ----------
    parquet_file
        Path (string or path-like) of the parquet file to check.

    Returns
    -------
    bool
        True when the file could be opened; False when any error occurred.
        The error is printed rather than raised.
    """
    try:
        # The context manager closes the throw-away connection even when
        # opening the file fails, so repeated checks do not leak connections.
        with duckdb.connect() as conn:
            conn.from_parquet(str(parquet_file))  # Try reading
        return True  # File is valid
    except Exception as e:
        print(f"Parquet file is corrupt: {e}")
        return False  # File is corrupt
        
def _get_default_cache_dir() -> Path:
    """Return the per-user cache directory for the "cellNexusPy" application."""
    cache_root = user_cache_dir("cellNexusPy")
    return Path(cache_root)

# Helper function to download a file over HTTP/HTTPS.
def _sync_remote_file(full_url: str, output_file: Path):
    """Download ``full_url`` to ``output_file`` unless that file already exists.

    The payload is streamed into a sibling ``*.part`` file that is renamed to
    ``output_file`` only after the transfer completed, so an interrupted
    download never leaves a truncated file that a later run would mistake for
    a complete one (existing files are never re-downloaded).

    Parameters
    ----------
    full_url
        HTTP(S) URL of the remote file.
    output_file
        Local destination; missing parent directories are created.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    OSError
        If the number of bytes received differs from the announced
        ``Content-Length``.
    """
    if output_file.exists():
        return

    output_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {full_url} to {output_file}", file=sys.stderr)

    partial_file = output_file.with_name(output_file.name + ".part")
    # (connect, read) timeouts so that a stalled server cannot hang the
    # notebook forever; the response is closed when the block exits.
    with requests.get(full_url, stream=True, allow_redirects=True, timeout=(30, 300)) as req:
        req.raise_for_status()
        expected_size = int(req.headers.get("Content-Length", 0)) or None
        try:
            # tqdm.wrapattr is a classmethod that creates its own progress
            # bar, so the total has to be passed to it directly. Calling it
            # on a pre-built bar shows a second bar without a total while the
            # first one stays at 0%.
            with tqdm.wrapattr(req.raw, "read", total=expected_size) as src, partial_file.open("wb") as dest:
                shutil.copyfileobj(src, dest)
            received_size = partial_file.stat().st_size
            if expected_size is not None and received_size != expected_size:
                raise OSError(
                    f"Incomplete download of {full_url}: "
                    f"got {received_size} of {expected_size} bytes"
                )
        except BaseException:
            # Never keep a partial payload around, not even on Ctrl-C.
            partial_file.unlink(missing_ok=True)
            raise
    # Publish the finished download under its final name.
    partial_file.replace(output_file)

# function to get metadata
def get_metadata(
    parquet_url: str = METADATA_URL,
    cache_dir: os.PathLike[str] = _get_default_cache_dir(),
) -> tuple[duckdb.DuckDBPyConnection, duckdb.DuckDBPyRelation]:
    """Open the metadata parquet file, downloading it into the cache if needed.

    Parameters
    ----------
    parquet_url
        URL of the metadata parquet file; its last path component is used as
        the local file name.
    cache_dir
        Directory in which the local copy is stored.

    Returns
    -------
    tuple
        A new DuckDB connection and the relation that reads the local parquet
        file through that connection.
    """
    parquet_local = Path(cache_dir) / parquet_url.split("/")[-1]

    if not parquet_local.exists() or not is_parquet_valid(parquet_local):
        print("File is missing or corrupted. Re-downloading...")
        # _sync_remote_file skips files that already exist, so a corrupt copy
        # has to be removed before it can be fetched again.
        parquet_local.unlink(missing_ok=True)
        _sync_remote_file(parquet_url, parquet_local)

    conn = duckdb.connect()
    return conn, conn.from_parquet(str(parquet_local))

def sync_assay_files(
    url: str = ASSAY_URL,
    cache_dir: Path = _get_default_cache_dir(),
    subdir: str = "",
    atlas: str = "",
    aggregation: str = "",
    files: Iterable[str] = (),
):
    """Lazily make sure each assay file is cached and yield its local path.

    This is a generator: nothing is downloaded until it is iterated.

    Parameters
    ----------
    url
        Base URL of the assay files.
    cache_dir
        Root of the local cache (a string or a ``Path``).
    subdir
        Assay sub-directory; it is also the first element of every yielded
        tuple.
    atlas
        Atlas path component.
    aggregation
        Aggregation level. For ``"single_cell"`` the remote URL has no
        aggregation component, while the local path always has one.
    files
        File names to synchronise.

    Yields
    ------
    tuple
        ``(subdir, local_path)`` for every entry of ``files``.
    """
    cache_root = Path(cache_dir)
    for file in files:
        if aggregation == "single_cell":
            sub_url = f"{url}/{atlas}/{subdir}/{file}"
        else:
            sub_url = f"{url}/{atlas}/{aggregation}/{subdir}/{file}"
        output_filepath = cache_root / atlas / aggregation / subdir / file

        # NOTE(review): _sync_remote_file returns immediately when the target
        # exists, so an existing file below MIN_EXPECTED_SIZE is NOT fetched
        # again; only missing files are downloaded. Valid files can be smaller
        # than the threshold, so deleting undersized files here would cause
        # repeated downloads - confirm the intended policy before changing it.
        if not output_filepath.exists() or os.path.getsize(output_filepath) < MIN_EXPECTED_SIZE:
            _sync_remote_file(sub_url, output_filepath)

        yield subdir, output_filepath

In [3]:
# Open the metadata table; get_metadata downloads the parquet file into the
# cache directory when no valid local copy exists.
conn, table = get_metadata()
# Show the relation as the cell output.
table

┌─────────────────────────────────────────────────────────────┬──────────────────────────────────────┬────────────────────┬─────────────────────────────────────────┬────────────────────────┬────────────────────────────┬──────────────────────────────────┬───────────┬────────────────────────┬────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────┬──────────────────────────────────────┬───────────────────┬─────────────────────────┬────────────────────────────────────┬─────────┬──────────────────────────┬──────────┬───────────────┬──────────────────────────────────────────────────────────────────────────────┬───────────────┬───────────────┬──────────┬─────────────────┬─────────────────────┬────

In [4]:
# Restrict the metadata to the cells of interest with a SQL filter expression.
query = table.filter("""
    self_reported_ethnicity = 'African'
    AND assay LIKE '%10%'
    AND tissue = 'lung parenchyma'
    AND cell_type LIKE '%CD4%'
""")
# Show the filtered relation as the cell output.
query

┌────────────────────────────────────────────────────────────────┬──────────────────────────────────────┬────────────────────┬──────────────────────────────────────────┬─────────────────────────────────┬────────────────────────────┬──────────────────────────────────┬───────────┬────────────────────────┬────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────┬──────────────────────────────────────┬───────────────────┬─────────────────────────┬────────────────────────────────────┬─────────┬──────────────────────────┬───────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────────────────────────────────────────────────────────────────────────┬─────

In [5]:
# Inputs shared by the cells below.
data = query  # filtered metadata relation
assay = "cpm"  # assay sub-directory to fetch
aggregation = "single_cell"
cache_directory = _get_default_cache_dir()
repository = ASSAY_URL
features = slice(None)  # full slice: keep every column when subsetting

In [6]:
# Fail early on an assay this notebook does not know about.
if assay not in assay_map:
    raise ValueError(f"Unknown assay {assay!r}; expected one of {sorted(assay_map)}")

# Accept a plain string as well as a Path for the cache location.
cache_directory = Path(cache_directory)
cache_directory.mkdir(exist_ok=True, parents=True)

# Distinct single-cell file names referenced by the selected metadata rows.
files_to_read = (
    data.project("file_id_cellNexus_single_cell").distinct().fetchdf()["file_id_cellNexus_single_cell"]
)

# NOTE(review): only the first distinct atlas id is used; this assumes the
# query covers a single atlas - TODO confirm.
atlas = data.project('"atlas_id"').distinct().fetchdf()["atlas_id"].iloc[0]

# sync_assay_files is a generator: nothing is downloaded until `synced` is
# iterated.
synced = sync_assay_files(
    url=repository, cache_dir=cache_directory, atlas=atlas, subdir=assay, aggregation=aggregation, files=files_to_read
)

In [8]:
# NOTE: filter_data must already be defined when this cell runs. In this
# notebook its definition sits in a later cell, so that cell has to be
# executed first.
for assay_name, files in itertools.groupby(synced, key=lambda item: item[0]):
    # `ads` is rebound for every group, so it ends up holding the AnnData
    # objects of the last group only.
    ads = [filter_data(local_path) for _subdir, local_path in files]

Downloading https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/cellNexus-anndata/cellxgene/03-06-2025/cpm/0a6ba1d6a93f1584aca99bfca381649f___1.h5ad to /home/juan/.cache/cellNexusPy/cellxgene/03-06-2025/single_cell/cpm/0a6ba1d6a93f1584aca99bfca381649f___1.h5ad
  0%|                                                                          | 0/2483330 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
64.0kB [00:00, 113kB/s][A
128kB [00:00, 156kB/s] [A
192kB [00:01, 178kB/s][A
448kB [00:01, 414kB/s][A
640kB [00:01, 489kB/s][A
896kB [00:02, 610kB/s][A
1.06MB [00:02, 619kB/s][A
1.31MB [00:02, 690kB/s][A
1.62MB [00:03, 809kB/s][A
1.88MB [00:03, 823kB/s][A
2.37MB [00:03, 677kB/s][A
  0%|                                                                          | 0/2483330 [00:03<?, ?it/s]
Downloading https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/cellNexus-anndata/cellxgene/03-06-2025/cpm/11095125eed0f93b0bf18634cd5ee612___1.h5ad to /h

In [7]:
def filter_data(file):
    """Load one ``.h5ad`` file and keep only the cells selected in ``data``.

    Relies on two notebook globals: ``data`` (the filtered metadata relation)
    and ``features`` (the column selector applied to the AnnData object).

    Parameters
    ----------
    file
        Path of the ``.h5ad`` file. Its base name is compared with the
        ``file_id_cellNexus_single_cell`` column of ``data``.

    Returns
    -------
    AnnData
        Rows ordered like the metadata rows of this file, with ``obs``
        replaced by that metadata and indexed by ``cell_id``.

    Raises
    ------
    ValueError
        If the rows matched in the file cannot be paired one-to-one with the
        metadata rows.
    """
    file_name = Path(file).name
    # Double single quotes so the name is always a valid SQL string literal.
    sql_name = file_name.replace("'", "''")
    cells = data.filter(f"file_id_cellNexus_single_cell = '{sql_name}'").fetchdf()

    adata = ad.read_h5ad(file)
    adata.obs.index = adata.obs.index.astype(str)
    obs_names = adata.obs.index
    cell_ids = cells["cell_id"].astype(str)

    # Literal substring match (regex=False): a cell id that contains regex
    # metacharacters must not be interpreted as a pattern.
    positions_per_cell = [
        np.flatnonzero(obs_names.str.contains(cid, regex=False)) for cid in cell_ids
    ]
    if positions_per_cell:
        positions = np.concatenate(positions_per_cell)
    else:
        # No metadata rows for this file: produce an empty selection instead
        # of failing inside np.concatenate.
        positions = np.empty(0, dtype=int)

    # The metadata is attached positionally below, so every metadata row needs
    # exactly one matching row in the file.
    if len(positions) != len(cells):
        raise ValueError(
            f"{file_name}: matched {len(positions)} rows for {len(cells)} metadata rows; "
            "cell ids must match exactly one observation each"
        )

    # Single subset + copy, rows ordered like the metadata rows.
    ann = adata[positions, features].copy()
    ann.obs = cells
    ann.obs.index = ann.obs["cell_id"]

    return ann

In [9]:
# Merge the per-file AnnData objects into one. With index_unique="_" a
# "_<n>" suffix is appended to every obs name (visible in the index of the
# obs table displayed below).
adatas = ad.concat(ads,index_unique="_")

In [10]:
# Per-cell metadata of the merged object.
adatas.obs

Unnamed: 0_level_0,cell_id,dataset_id,observation_joinid,sample_id,cell_type,cell_type_ontology_term_id,sample_,assay,assay_ontology_term_id,cell_count,...,metacell_2048,metacell_4096,metacell_8192,nCount_RNA,empty_droplet,sample_chunk,cell_chunk,sample_pseudobulk_chunk,file_id_cellNexus_single_cell,file_id_cellNexus_pseudobulk
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CACATAGGTCCCTTGT_SC24___9f222629-9e39-47d0-b83f-e08d610c7479_0,CACATAGGTCCCTTGT_SC24___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,7kpoBSb|SG,270eb221dd0456cc063240404aec74cd,"CD4-positive, alpha-beta T cell",CL:0000624,270eb221dd0456cc063240404aec74cd,10x 3' v2,EFO:0009899,2282447,...,,,,3147.82345,False,1.0,7.0,2.0,0a6ba1d6a93f1584aca99bfca381649f___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
CGAATGTAGGTAGCCA_SC24___9f222629-9e39-47d0-b83f-e08d610c7479_0,CGAATGTAGGTAGCCA_SC24___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,8~c4A6~k;T,270eb221dd0456cc063240404aec74cd,"CD4-positive, alpha-beta T cell",CL:0000624,270eb221dd0456cc063240404aec74cd,10x 3' v2,EFO:0009899,2282447,...,,,,2927.82177,False,1.0,7.0,2.0,0a6ba1d6a93f1584aca99bfca381649f___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
CGTAGCGCATGGTCTA_SC24___9f222629-9e39-47d0-b83f-e08d610c7479_0,CGTAGCGCATGGTCTA_SC24___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,gpcVnDcFnR,270eb221dd0456cc063240404aec74cd,"CD4-positive, alpha-beta T cell",CL:0000624,270eb221dd0456cc063240404aec74cd,10x 3' v2,EFO:0009899,2282447,...,,,,3127.66997,False,1.0,7.0,2.0,0a6ba1d6a93f1584aca99bfca381649f___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
TTCTCAACAAGTAATG_SC24___9f222629-9e39-47d0-b83f-e08d610c7479_0,TTCTCAACAAGTAATG_SC24___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,PY$`-ko?sc,270eb221dd0456cc063240404aec74cd,"CD4-positive, alpha-beta T cell",CL:0000624,270eb221dd0456cc063240404aec74cd,10x 3' v2,EFO:0009899,2282447,...,,,,3028.49335,False,1.0,7.0,2.0,0a6ba1d6a93f1584aca99bfca381649f___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
CCATGTCTCCAAACTG_SC24___9f222629-9e39-47d0-b83f-e08d610c7479_0,CCATGTCTCCAAACTG_SC24___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,Y4tW%W3>80,270eb221dd0456cc063240404aec74cd,"CD4-positive, alpha-beta T cell",CL:0000624,270eb221dd0456cc063240404aec74cd,10x 3' v2,EFO:0009899,2282447,...,,,,3261.89737,False,1.0,7.0,2.0,0a6ba1d6a93f1584aca99bfca381649f___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LAP92_TTCAATCTCTCTATGT-1_duong___9f222629-9e39-47d0-b83f-e08d610c7479_75,LAP92_TTCAATCTCTCTATGT-1_duong___9f222629-9e39...,9f222629-9e39-47d0-b83f-e08d610c7479,28T^C{Rw@@,9c8fa5a8d2ae37179b579a0217670512___LAP92_1_duong,"CD4-positive, alpha-beta T cell",CL:0000624,9c8fa5a8d2ae37179b579a0217670512,10x 3' v3,EFO:0009922,2282447,...,,,,7592.75449,False,1.0,11.0,2.0,54df411ac4d6aa2cfd19c3f2dbf3e13c___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
TAAGAGACAACGATCT_SC07___9f222629-9e39-47d0-b83f-e08d610c7479_76,TAAGAGACAACGATCT_SC07___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,fJ!kZXa$I},c03887220681b9250f73f851d6868720,"CD4-positive, alpha-beta T cell",CL:0000624,c03887220681b9250f73f851d6868720,10x 3' v2,EFO:0009899,2282447,...,,,,3299.88625,False,1.0,10.0,2.0,f729bdf20e0ccfebede9fc5eeaceb80b___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
CAACTAGAGGTGCAAC_SC07___9f222629-9e39-47d0-b83f-e08d610c7479_76,CAACTAGAGGTGCAAC_SC07___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,ifGh1*z$V!,c03887220681b9250f73f851d6868720,"CD4-positive, alpha-beta T cell",CL:0000624,c03887220681b9250f73f851d6868720,10x 3' v2,EFO:0009899,2282447,...,,,,4198.71461,False,1.0,10.0,2.0,f729bdf20e0ccfebede9fc5eeaceb80b___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad
GCCAGTGAGGTCCCGT_SC84___9f222629-9e39-47d0-b83f-e08d610c7479_77,GCCAGTGAGGTCCCGT_SC84___9f222629-9e39-47d0-b83...,9f222629-9e39-47d0-b83f-e08d610c7479,D^{!_$cDey,4f067f7e5f960bc72b0710684a521e84____SC84,"CD4-positive, alpha-beta T cell",CL:0000624,4f067f7e5f960bc72b0710684a521e84,10x 3' v3,EFO:0009922,2282447,...,,,,4985.53721,False,1.0,14.0,2.0,0abfb48fcfcd4912e413ea49dc0ae071___1.h5ad,c25a6ea6b00d263d6cbb2d06a542a2c7___1.h5ad


In [72]:
# Build the path from the configured cache directory rather than a
# user-specific absolute path, so the cell also works on other machines.
file = str(
    Path(cache_directory)
    / "cellxgene"
    / "03-06-2025"
    / "single_cell"
    / "counts"
    / "1af9e4710374605bb904949487c6108e___1.h5ad"
)

In [73]:
# Debugging cell: repeat the metadata lookup and the regex pre-filter for the
# single file selected above.
cells = data.filter(f"file_id_cellNexus_single_cell ='{str(file).split('/')[-1]}'").fetchdf()
anndata = ad.read_h5ad(file)
anndata.obs.index = anndata.obs.index.astype(str)
cell_ids = cells["cell_id"].astype(str)
# Alternation of the escaped cell ids: a row is kept when its name contains
# any of them.
pattern = '|'.join(map(re.escape, cell_ids))
mask = anndata.obs.index.str.contains(pattern, regex=True)
anndata = anndata[mask,features].copy()

In [76]:
# Observations of the file whose names matched the pattern.
anndata.obs

Unnamed: 0,observation_joinid,observation_originalid,donor_id,dataset_id,sample_id,cell_type
CAAGATCAGAGGGCTT_SC18___9f222629-9e39-47d0-b83f-e08d610c7479,CYabAXtZ)N,CAAGATCAGAGGGCTT_SC18,homosapiens_None_2023_None_sikkemalisa_002_d10...,9f222629-9e39-47d0-b83f-e08d610c7479,0000c153da22cf963b807c0563aca6a6,"CD4-positive, alpha-beta T cell"


In [77]:
# Cell ids taken from the metadata rows of this file.
cell_ids

0    CAAGATCAGAGGGCTT_SC18___9f222629-9e39-47d0-b83...
Name: cell_id, dtype: object