In [None]:
from aavomics import database
import pandas
import numpy
import anndata
import os

In [None]:
# Alignment identifier, passed as `alignment_name` when resolving each cell
# set's AnnData file path.
# NOTE(review): the name suggests Cell Ranger 5.0.1 gene-expression output
# against the mm10 2020-A reference; confirm against the aavomics database.
ALIGNMENT_NAME = "cellranger_5.0.1_gex_mm10_2020_A"

# Names of the `adata.obs` columns whose cell type labels are tallied.
TAXONOMY_NAMES = [
    "CCN202105050",
    "CCN202105051",
]

In [None]:
# Per-sample tally of called cells for every annotated cell type:
# {cell set name: {cell type label: number of called cells}}.
cell_set_cell_type_counts = {}

for cell_set in database.CELL_SETS:

    # Progress indicator: one line per cell set as it is loaded.
    print(cell_set.name)

    cell_set_cell_type_counts[cell_set.name] = {}

    adata = anndata.read_h5ad(cell_set.get_anndata_file_path(alignment_name=ALIGNMENT_NAME))

    # Loop-invariant, so build it once per cell set instead of once per cell
    # type. "Cell Called" is compared against the string "True", not a boolean.
    cell_called_mask = adata.obs["Cell Called"] == "True"

    for taxonomy_name in TAXONOMY_NAMES:

        cell_type_labels = adata.obs[taxonomy_name]

        for cell_type in cell_type_labels.unique():

            # Skip real missing values (NaN / None / pandas.NA) explicitly:
            # they never compare equal to the string "nan", so without this
            # check they would be recorded as a bogus all-zero entry.
            if pandas.isna(cell_type):
                continue

            # Skip placeholder labels that do not name a cell type, including
            # missing annotations stored as the literal string "nan".
            if cell_type in ["Unknown", "Multiplets"] or cell_type == "nan":
                continue

            cell_type_mask = (cell_type_labels == cell_type) & cell_called_mask

            # NOTE(review): counts are keyed by label only, so a label present
            # in more than one taxonomy is overwritten by the later taxonomy --
            # confirm labels are unique across TAXONOMY_NAMES.
            cell_set_cell_type_counts[cell_set.name][cell_type] = cell_type_mask.sum()

In [None]:
# Write the per-sample cell type counts to out/cell_type_distribution_by_sample.csv.

# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs("out", exist_ok=True)

# orient="index" yields one row per cell set and one column per cell type.
# A cell type never recorded for a given cell set is left as a missing value,
# which is written as an empty CSV field.
cell_type_counts_table = pandas.DataFrame.from_dict(cell_set_cell_type_counts, orient="index")

cell_type_counts_table.to_csv(os.path.join("out", "cell_type_distribution_by_sample.csv"))