In [1]:
from aavomics import database
import pandas
import anndata
import os
import numpy

In [2]:
# Which alignment to use. Set to None to use the first available
ALIGNMENT_NAME = "cellranger_5.0.1_gex_mm10_2020_A"
# Alignment name used when loading the AnnData file that holds virus counts.
VIRUS_ALIGNMENT_NAME = "cellranger_5.0.1_gex_mm10_2020_A_AAVomics"

# Names of the `obs` columns read from each AnnData file: the first is compared
# against "Multiplets", the second is checked for missing (untyped) entries.
DROPLET_TAXONOMY_NAME = "CCN202105041"
CELL_TYPE_TAXONOMY_NAME = "CCN202105060"


# Cell sets for which virus reads and virus UMI statistics are computed.
AMPLIFIED_CELL_SETS_FILTER = [
    "20181127_TC1",
    "20190319_TC2",
    "20190111_BC1",
    "20190321_BC2",
    "20190711_TC4",
    "20190712_TC5",
    "20190713_TC6",
    "20190713_TC7",
    "20200720_BC4_1",
    "20200720_BC4_2",
    "20210719_BC5",
    "20210720_BC6",
]
# Cell sets reported with their injected variants and incubation time: every
# amplified cell set plus the ones listed here. Any cell set not in this list
# is reported as "Cell Typing Only".
INJECTED_CELL_SETS_FILTER = [
    *AMPLIFIED_CELL_SETS_FILTER,
    "20200807_C1",
    "20210726_TC11",
    "20210728_C5",
    "20210728_TC12",
    "20200907_C3",
    "20201119_C4",
    "20200903_TC8",
    "20200904_TC9",
]

In [None]:
# Build one summary row per cell set and write the table to
# "aavomics_sample_metadata.csv" under database.DATA_PATH.
#
# A cell set is included only if it has an associated animal, its source tissue
# region is "Cortex", and its transcriptome AnnData file exists on disk.
samples_dict = {}

for cell_set in database.CELL_SETS:

    if cell_set.source_tissue.animal is None:
        continue

    if cell_set.source_tissue.region != "Cortex":
        continue

    animal = cell_set.source_tissue.animal
    is_amplified = cell_set.name in AMPLIFIED_CELL_SETS_FILTER
    is_injected = cell_set.name in INJECTED_CELL_SETS_FILTER

    # Sum reads over every read set of every library. Virus reads are only
    # counted for amplified cell sets. Each library's own read sets are
    # iterated here; previously the virus branch reused a stale `read_set`
    # left over from the transcriptome loop.
    num_transcriptome_reads = 0
    num_virus_reads = 0

    for sequencing_library in cell_set.sequencing_libraries:
        for read_set in sequencing_library.read_sets:
            if sequencing_library.type == "Transcriptome":
                num_transcriptome_reads += read_set.num_reads
            elif is_amplified and sequencing_library.type == "Virus Transcripts":
                num_virus_reads += read_set.num_reads

    adata_path = cell_set.get_anndata_file_path(alignment_name=ALIGNMENT_NAME, transcript_type="transcriptome")

    if not os.path.exists(adata_path):
        continue

    adata = anndata.read_h5ad(adata_path, backed="r")
    # NOTE(review): "Cell Called" is compared against the string "True" while
    # "Cell Ranger Called" is compared against the boolean True - presumably
    # the two columns are stored with different dtypes; verify.
    called_cells_mask = adata.obs["Cell Called"] == "True"
    typed_cells_mask = ~(adata.obs[CELL_TYPE_TAXONOMY_NAME].isna())
    print(called_cells_mask.sum(), typed_cells_mask.sum())
    cellranger_called_cells_mask = adata.obs["Cell Ranger Called"] == True
    multiplets_mask = adata.obs[DROPLET_TAXONOMY_NAME] == "Multiplets"

    # Per-cell medians over called cells only.
    called_cells_adata = adata[called_cells_mask]
    UMIs_per_cell = numpy.median(numpy.array(called_cells_adata.X.sum(axis=1)).flatten())
    genes_per_cell = numpy.median((called_cells_adata.X > 0).toarray().sum(axis=1))
    # The masks above are in-memory pandas objects, so the backing file is no
    # longer needed; release the handle instead of leaking one per cell set.
    adata.file.close()

    num_called_cells = called_cells_mask.sum()

    # Collect "<delivery vehicle> <cargo>" labels and delivery-vehicle names
    # across all injections of this animal.
    vector_cargos = set()
    viruses = set()

    for injection in animal.injections:
        for vector in injection.vector_pool.vectors:
            vector_cargos.add("%s %s" % (vector.delivery_vehicle.name, vector.cargo.name))
            viruses.add(vector.delivery_vehicle.name)

    # No individual vectors found: fall back to the vector pool names.
    if len(viruses) == 0:
        for injection in animal.injections:
            vector_cargos.add(injection.vector_pool.name)

    if is_injected:
        # Sort so the joined label is identical from run to run (set iteration
        # order of strings is not stable across interpreter sessions).
        vector_cargos = ";".join(sorted(vector_cargos))
        incubation_time = (animal.extraction_date - animal.injection_date).days
    else:
        vector_cargos = "Cell Typing Only"
        incubation_time = 0

    if is_amplified:
        virus_adata_path = cell_set.get_anndata_file_path(alignment_name=VIRUS_ALIGNMENT_NAME, transcript_type="transcriptome")
        virus_adata = anndata.read_h5ad(virus_adata_path, backed="r")
        # pandas does not accept a set as a .loc indexer, so pass a list.
        num_disambiguated_virus_barcode_UMIs = virus_adata[called_cells_mask].obs.loc[:, list(viruses)].sum().sum()
        num_virus_barcode_UMIs = virus_adata[called_cells_mask, "AAV"].X.sum()
        virus_adata.file.close()
        virus_disambiguation_rate = num_disambiguated_virus_barcode_UMIs/num_virus_barcode_UMIs
    else:
        virus_disambiguation_rate = 0

    # DOB may be missing; report an age of 0 in that case.
    if animal.DOB is None:
        age_at_extraction = 0
    else:
        age_at_extraction = (animal.extraction_date - animal.DOB).days

    samples_dict[cell_set.name] = {
        "10X Version": int(cell_set.source_tissue.dissociation_run.protocol_version),
        "Animal ID": animal.name,
        "Target # Cells": int(cell_set.target_num_cells),
        "Recovered Cells": num_called_cells,
        "Typed Cells": typed_cells_mask.sum(),
        "Cell Ranger # Cells": cellranger_called_cells_mask.sum(),
        "Predicted Multiplets": multiplets_mask.sum(),
        "Transcriptome Sequencing Depth": num_transcriptome_reads,
        "Transcriptome Reads/Cell": num_transcriptome_reads/num_called_cells,
        "Median UMIs/Cell": UMIs_per_cell,
        "Median Genes/Cell": genes_per_cell,
        "Variants Recovered": vector_cargos,
        "Virus Sequencing Depth": num_virus_reads,
        "Virus Reads/Cell": num_virus_reads/num_called_cells,
        "Age at Extraction (Days)": age_at_extraction,
        "Virus Incubation time (Days)": incubation_time,
        "Percent of Virus UMIs Determined": virus_disambiguation_rate * 100
    }

df = pandas.DataFrame.from_dict(samples_dict, orient="index")
df.to_csv(os.path.join(database.DATA_PATH, "aavomics_sample_metadata.csv"))

# Show the finished table once, rather than re-rendering the growing table on
# every loop iteration.
display(df)