In [None]:
import os
import pandas

from aavomics import database
import numpy
from plotly import graph_objects
from plotly import offline as plotly
from aavomics import aavomics
import anndata
from scipy import stats

In [None]:
# Alignment whose AnnData file is loaded as `adata`; its obs columns named after
# the viruses below are the per-droplet counts that get subsampled.
ALIGNMENT_NAME = "cellranger_5.0.1_gex_mm10_2020_A_AAVomics"
# Alignment whose AnnData file is loaded as `taxonomy_adata`; its obs columns hold
# the taxonomy labels used to build the debris and cell type masks.
TAXONOMY_ALIGNMENT_NAME = "cellranger_5.0.1_gex_mm10_2020_A"

# obs column in `taxonomy_adata` in which the value "Debris" marks the droplets
# used as background counts.
DEBRIS_TAXONOMY_NAME = "CCN202105041"
# obs column in `taxonomy_adata` holding the cell (sub)type label of each droplet;
# droplets with a missing value in this column are treated as uncalled.
TAXONOMY_NAME = "CCN202105060"
# Key into database.CELL_SETS_DICT; also the prefix of the saved figure names.
CELL_SET_NAME = "20190713_TC6"
# The two viruses being compared. The plotted value is the fraction for
# VIRUS_NAME_1 minus the fraction for VIRUS_NAME_2.
VIRUS_NAME_1 = "CAP-B10"
VIRUS_NAME_2 = "PHP.eB"

# Passed through unchanged as `method=` and `threshold=` to
# aavomics.get_transcript_presence_rate.
TRANSDUCTION_RATE_METHOD = aavomics.Infection_Rate_Method.COUNTING
TRANSDUCTION_THRESHOLD = None

In [None]:
# Two-level grouping of taxonomy labels: each top-level cell type maps to the
# subtype labels (values found in the TAXONOMY_NAME obs column) that belong to it.
# The innermost dicts are empty placeholders; only the keys are used.
CELL_TYPE_HIERARCHY = {
    "Neurons": {
        "L2": {},
        "L2/3": {},
        "L3": {},
        "L4/5": {},
        "L5": {},
        "L5/6": {},
        "L6": {},
        "Pvalb": {},
        "Sst": {},
        "Vip": {},
        "Sncg": {},
        "Pax6": {},
        "Lamp5": {}
    },
    "Astrocytes": {
        "Myoc- Astrocytes": {},
        "Myoc+ Astrocytes": {}
    },
    "Vascular Cells": {
        "Endothelial Cells": {},
        "Pericytes": {},
        "Red Blood Cells": {},
        "Vascular SMCs": {},
        "VLMCs": {}
    },
    "Immune Cells": {
        "Microglia": {},
        "Perivascular Macrophages": {},
        "Leukocytes": {}
    },
    "Oligodendrocytes": {
        # NOTE(review): "OPCs" used to be listed twice here. A repeated dict key is
        # silently collapsed by Python, so dropping the repeat changes nothing at
        # runtime. If the second entry was meant to be a different subtype label,
        # add it here.
        "OPCs": {},
        "Committed Oligodendrocytes": {},
        "Mature Oligodendrocytes": {}
    }
}

In [None]:
# Look up the cell set and load its two AnnData files: one from the alignment that
# carries the virus count columns, one from the alignment that carries the
# taxonomy labels.
cell_set = database.CELL_SETS_DICT[CELL_SET_NAME]

adata_path = cell_set.get_anndata_file_path(alignment_name=ALIGNMENT_NAME)
adata = anndata.read_h5ad(adata_path)

taxonomy_adata_path = cell_set.get_anndata_file_path(alignment_name=TAXONOMY_ALIGNMENT_NAME)
taxonomy_adata = anndata.read_h5ad(taxonomy_adata_path)

# Keep only the droplets that received a label in the TAXONOMY_NAME column, and
# collect the distinct labels that occur among them.
has_cell_type_label = taxonomy_adata.obs[TAXONOMY_NAME].notna()
called_cells_adata = taxonomy_adata[has_cell_type_label].copy()
cell_types = called_cells_adata.obs[TAXONOMY_NAME].unique()

In [None]:
# Subsampling simulation at the top level of CELL_TYPE_HIERARCHY.
#
# For every subsample rate and each of the two viruses, the per-droplet virus
# counts (adata.obs[virus_name]) of the debris droplets and of every cell type are
# redrawn with round(total / rate) counts, the transduction rate of each cell type
# is re-estimated against the subsampled debris, and each cell type's share of all
# transduced cells is recorded. The value plotted later is the difference of that
# share between VIRUS_NAME_1 and VIRUS_NAME_2 (mean and standard error over
# NUM_ITERATIONS redraws).
subsample_rates = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 300, 400, 500]
NUM_ITERATIONS = 5

# Seed numpy's global RNG so that re-running this cell reproduces the same curves.
SUBSAMPLING_SEED = 0
numpy.random.seed(SUBSAMPLING_SEED)

x_values_by_cell_type = {cell_type: [] for cell_type in CELL_TYPE_HIERARCHY}
y_values_by_cell_type = {cell_type: [] for cell_type in CELL_TYPE_HIERARCHY}
y_errors_by_cell_type = {cell_type: [] for cell_type in CELL_TYPE_HIERARCHY}

virus_names = [VIRUS_NAME_1, VIRUS_NAME_2]

# The masks and the raw counts depend on neither the subsample rate nor the
# iteration, so they are computed once instead of inside the innermost loop.
# NOTE(review): masks built from taxonomy_adata.obs are applied to adata; this
# assumes both objects hold the same droplets in the same order -- confirm.
debris_mask = taxonomy_adata.obs[DEBRIS_TAXONOMY_NAME] == "Debris"
cell_type_masks = {
    cell_type: taxonomy_adata.obs[TAXONOMY_NAME].isin(list(CELL_TYPE_HIERARCHY[cell_type]))
    for cell_type in CELL_TYPE_HIERARCHY
}
num_cells_by_cell_type = {cell_type: cell_type_masks[cell_type].sum() for cell_type in CELL_TYPE_HIERARCHY}

debris_counts_by_virus = {
    virus_name: adata[debris_mask].obs[virus_name].values
    for virus_name in virus_names
}
cell_type_counts_by_virus = {
    virus_name: {
        cell_type: adata[cell_type_masks[cell_type]].obs[virus_name].values
        for cell_type in CELL_TYPE_HIERARCHY
    }
    for virus_name in virus_names
}

for subsample_rate in subsample_rates:

    # Reported once per rate (it used to be repeated for every virus).
    print("Testing subsample rate of %i" % subsample_rate)

    virus_cell_type_fraction_of_transduced = {}

    for virus_name in virus_names:

        debris_counts = debris_counts_by_virus[virus_name]
        total_debris_counts = debris_counts.sum()

        # NOTE(review): the debris background is redrawn once per (rate, virus) and
        # shared by all iterations, so the reported standard error does not include
        # debris sampling noise -- confirm this is intended.
        if total_debris_counts == 0:
            # Nothing to redraw; dividing by a zero total would hand NaN
            # probabilities to numpy.random.choice, which raises.
            subsampled_debris_counts = [0] * len(debris_counts)
        else:
            num_debris_counts_to_sample = int(numpy.round(total_debris_counts / subsample_rate))

            # Each redrawn count lands on a droplet with probability proportional
            # to that droplet's original count.
            subsampled_debris_indices = numpy.random.choice(
                len(debris_counts),
                replace=True,
                size=num_debris_counts_to_sample,
                p=debris_counts / total_debris_counts
            )
            _, subsampled_non_zero_counts = numpy.unique(subsampled_debris_indices, return_counts=True)

            # Droplets that received no count are appended as zeros so the result
            # has one entry per original droplet.
            subsampled_debris_counts = list(subsampled_non_zero_counts) + [0] * (len(debris_counts) - len(subsampled_non_zero_counts))

        virus_cell_type_fraction_of_transduced[virus_name] = {cell_type: [] for cell_type in CELL_TYPE_HIERARCHY}

        for iteration in range(NUM_ITERATIONS):

            cell_type_num_transduced = {}
            total_num_transduced = 0

            for cell_type in CELL_TYPE_HIERARCHY:

                cell_type_counts = cell_type_counts_by_virus[virus_name][cell_type]
                total_cell_type_counts = cell_type_counts.sum()

                # Same guard as in the subtype simulation: a cell type without any
                # counts (or without any cells) contributes no transduced cells and
                # cannot be subsampled.
                if total_cell_type_counts == 0:
                    cell_type_num_transduced[cell_type] = 0
                    continue

                num_cell_counts_to_sample = int(numpy.round(total_cell_type_counts / subsample_rate))

                subsampled_cell_type_indices = numpy.random.choice(
                    len(cell_type_counts),
                    replace=True,
                    size=num_cell_counts_to_sample,
                    p=cell_type_counts / total_cell_type_counts
                )
                _, subsampled_non_zero_counts = numpy.unique(subsampled_cell_type_indices, return_counts=True)

                subsampled_cell_counts = list(subsampled_non_zero_counts) + [0] * (len(cell_type_counts) - len(subsampled_non_zero_counts))

                print(subsample_rate, virus_name, cell_type, sum(subsampled_cell_counts))
                virus_rate = aavomics.get_transcript_presence_rate(
                    numpy.array(subsampled_cell_counts),
                    method=TRANSDUCTION_RATE_METHOD,
                    background_transcript_counts=numpy.array(subsampled_debris_counts),
                    resolution=400,
                    threshold=TRANSDUCTION_THRESHOLD
                )

                # Some call paths hand back a tuple; only its first element is used
                # as the rate.
                if isinstance(virus_rate, tuple):
                    virus_rate = virus_rate[0]

                num_transduced = virus_rate * num_cells_by_cell_type[cell_type]
                cell_type_num_transduced[cell_type] = num_transduced
                total_num_transduced += num_transduced

            for cell_type in CELL_TYPE_HIERARCHY:
                if total_num_transduced == 0:
                    virus_cell_type_fraction_of_transduced[virus_name][cell_type].append(0)
                else:
                    virus_cell_type_fraction_of_transduced[virus_name][cell_type].append(cell_type_num_transduced[cell_type]/total_num_transduced)

    for cell_type in CELL_TYPE_HIERARCHY:

        x_values_by_cell_type[cell_type].append(subsample_rate)

        # Iteration-wise difference in the share of transduced cells between the
        # two viruses.
        values = [
            fraction_1 - fraction_2
            for fraction_1, fraction_2 in zip(
                virus_cell_type_fraction_of_transduced[VIRUS_NAME_1][cell_type],
                virus_cell_type_fraction_of_transduced[VIRUS_NAME_2][cell_type]
            )
        ]

        y_values_by_cell_type[cell_type].append(numpy.mean(values))
        y_errors_by_cell_type[cell_type].append(stats.sem(values))

In [None]:
# Debugging probe: shows the total of the `subsampled_cell_counts` list most
# recently assigned by the simulation loop in the previous cell.
sum(subsampled_cell_counts)

In [None]:
# One line per top-level cell type: subsample rate on a log-scaled x axis against
# the mean difference (VIRUS_NAME_1 minus VIRUS_NAME_2) in the cell type's share of
# transduced cells, with the standard error over iterations as error bars.
traces = [
    graph_objects.Scatter(
        x=x_values_by_cell_type[cell_type],
        y=y_values_by_cell_type[cell_type],
        name=cell_type,
        mode="lines+markers",
        error_y={
            "array": y_errors_by_cell_type[cell_type]
        }
    )
    for cell_type in CELL_TYPE_HIERARCHY
]

# Alpha of 0 makes both the plot area and the surrounding paper transparent.
layout = {
    "xaxis": {
        "type": "log"
    },
    "plot_bgcolor": "rgba(255, 255, 255, 0)",
    "paper_bgcolor": "rgba(255, 255, 255, 0)"
}

figure = graph_objects.Figure(data=traces, layout=layout)

plotly.iplot(figure)

# Make sure the output directory exists so the export does not fail on a fresh
# checkout.
output_directory = "out"
os.makedirs(output_directory, exist_ok=True)

figure.write_image(os.path.join(output_directory, "%s_%s_vs_%s_cell_type_tropism_simulated.svg" % (CELL_SET_NAME, VIRUS_NAME_1, VIRUS_NAME_2)))

In [None]:
# Subsampling simulation at the subtype level: same procedure as for the top-level
# cell types, but every second-level label of CELL_TYPE_HIERARCHY is treated as its
# own population.
cell_subtypes = [
    cell_subtype
    for cell_type in CELL_TYPE_HIERARCHY
    for cell_subtype in CELL_TYPE_HIERARCHY[cell_type]
]

# At a rate of r, round(total / r) counts are redrawn for each population.
subsample_rates = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 300, 400, 500]
NUM_ITERATIONS = 5

# Seed numpy's global RNG so that re-running this cell reproduces the same curves.
SUBSAMPLING_SEED = 0
numpy.random.seed(SUBSAMPLING_SEED)

x_values_by_cell_type = {cell_type: [] for cell_type in cell_subtypes}
y_values_by_cell_type = {cell_type: [] for cell_type in cell_subtypes}
y_errors_by_cell_type = {cell_type: [] for cell_type in cell_subtypes}

virus_names = [VIRUS_NAME_1, VIRUS_NAME_2]

# The masks and the raw counts depend on neither the subsample rate nor the
# iteration, so they are computed once instead of inside the innermost loop.
# NOTE(review): masks built from taxonomy_adata.obs are applied to adata; this
# assumes both objects hold the same droplets in the same order -- confirm.
debris_mask = taxonomy_adata.obs[DEBRIS_TAXONOMY_NAME] == "Debris"
cell_subtype_masks = {
    cell_type: taxonomy_adata.obs[TAXONOMY_NAME].isin([cell_type])
    for cell_type in cell_subtypes
}
num_cells_by_cell_subtype = {cell_type: cell_subtype_masks[cell_type].sum() for cell_type in cell_subtypes}

debris_counts_by_virus = {
    virus_name: adata[debris_mask].obs[virus_name].values
    for virus_name in virus_names
}
cell_subtype_counts_by_virus = {
    virus_name: {
        cell_type: adata[cell_subtype_masks[cell_type]].obs[virus_name].values
        for cell_type in cell_subtypes
    }
    for virus_name in virus_names
}

for subsample_rate in subsample_rates:

    # Reported once per rate (it used to be repeated for every virus).
    print("Testing subsample rate of %i" % subsample_rate)

    virus_cell_type_fraction_of_transduced = {}

    for virus_name in virus_names:

        debris_counts = debris_counts_by_virus[virus_name]
        total_debris_counts = debris_counts.sum()

        # NOTE(review): the debris background is redrawn once per (rate, virus) and
        # shared by all iterations, so the reported standard error does not include
        # debris sampling noise -- confirm this is intended.
        if total_debris_counts == 0:
            # Nothing to redraw; dividing by a zero total would hand NaN
            # probabilities to numpy.random.choice, which raises.
            subsampled_debris_counts = [0] * len(debris_counts)
        else:
            num_debris_counts_to_sample = int(numpy.round(total_debris_counts / subsample_rate))

            # Each redrawn count lands on a droplet with probability proportional
            # to that droplet's original count.
            subsampled_debris_indices = numpy.random.choice(
                len(debris_counts),
                replace=True,
                size=num_debris_counts_to_sample,
                p=debris_counts / total_debris_counts
            )
            _, subsampled_non_zero_counts = numpy.unique(subsampled_debris_indices, return_counts=True)

            # Droplets that received no count are appended as zeros so the result
            # has one entry per original droplet.
            subsampled_debris_counts = list(subsampled_non_zero_counts) + [0] * (len(debris_counts) - len(subsampled_non_zero_counts))

        virus_cell_type_fraction_of_transduced[virus_name] = {cell_type: [] for cell_type in cell_subtypes}

        for iteration in range(NUM_ITERATIONS):

            cell_type_num_transduced = {}
            total_num_transduced = 0

            for cell_type in cell_subtypes:

                cell_type_counts = cell_subtype_counts_by_virus[virus_name][cell_type]

                # A subtype without any counts (or without any cells) contributes no
                # transduced cells and cannot be subsampled.
                if cell_type_counts.sum() == 0:
                    cell_type_num_transduced[cell_type] = 0
                    continue

                total_cell_type_counts = cell_type_counts.sum()

                num_cell_counts_to_sample = int(numpy.round(total_cell_type_counts / subsample_rate))

                subsampled_cell_type_indices = numpy.random.choice(
                    len(cell_type_counts),
                    replace=True,
                    size=num_cell_counts_to_sample,
                    p=cell_type_counts / total_cell_type_counts
                )
                _, subsampled_non_zero_counts = numpy.unique(subsampled_cell_type_indices, return_counts=True)

                subsampled_cell_counts = list(subsampled_non_zero_counts) + [0] * (len(cell_type_counts) - len(subsampled_non_zero_counts))

                print(subsample_rate, virus_name, cell_type, sum(subsampled_cell_counts))

                virus_rate = aavomics.get_transcript_presence_rate(
                    numpy.array(subsampled_cell_counts),
                    method=TRANSDUCTION_RATE_METHOD,
                    background_transcript_counts=numpy.array(subsampled_debris_counts),
                    resolution=400,
                    threshold=TRANSDUCTION_THRESHOLD
                )

                # Some call paths hand back a tuple; only its first element is used
                # as the rate.
                if isinstance(virus_rate, tuple):
                    virus_rate = virus_rate[0]

                num_transduced = virus_rate * num_cells_by_cell_subtype[cell_type]
                cell_type_num_transduced[cell_type] = num_transduced
                total_num_transduced += num_transduced

            for cell_type in cell_subtypes:
                if total_num_transduced == 0:
                    virus_cell_type_fraction_of_transduced[virus_name][cell_type].append(0)
                else:
                    virus_cell_type_fraction_of_transduced[virus_name][cell_type].append(cell_type_num_transduced[cell_type]/total_num_transduced)

    for cell_type in cell_subtypes:

        x_values_by_cell_type[cell_type].append(subsample_rate)

        # Iteration-wise difference in the share of transduced cells between the
        # two viruses.
        values = [
            fraction_1 - fraction_2
            for fraction_1, fraction_2 in zip(
                virus_cell_type_fraction_of_transduced[VIRUS_NAME_1][cell_type],
                virus_cell_type_fraction_of_transduced[VIRUS_NAME_2][cell_type]
            )
        ]

        y_values_by_cell_type[cell_type].append(numpy.mean(values))
        y_errors_by_cell_type[cell_type].append(stats.sem(values))

In [None]:
# Debugging probe: shows the value of `total_cell_type_counts` left over from the
# last pass of the simulation loop that assigned it.
total_cell_type_counts

In [None]:
# One line per cell subtype: subsample rate on a log-scaled x axis against the mean
# difference (VIRUS_NAME_1 minus VIRUS_NAME_2) in the subtype's share of transduced
# cells, with the standard error over iterations as error bars.
traces = [
    graph_objects.Scatter(
        x=x_values_by_cell_type[cell_type],
        y=y_values_by_cell_type[cell_type],
        name=cell_type,
        mode="lines+markers",
        error_y={
            "array": y_errors_by_cell_type[cell_type]
        }
    )
    for cell_type in cell_subtypes
]

# Alpha of 0 makes both the plot area and the surrounding paper transparent.
layout = {
    "xaxis": {
        "type": "log"
    },
    "plot_bgcolor": "rgba(255, 255, 255, 0)",
    "paper_bgcolor": "rgba(255, 255, 255, 0)"
}

figure = graph_objects.Figure(data=traces, layout=layout)

plotly.iplot(figure)

# Make sure the output directory exists so the export does not fail on a fresh
# checkout.
output_directory = "out"
os.makedirs(output_directory, exist_ok=True)

figure.write_image(os.path.join(output_directory, "%s_%s_vs_%s_cell_subtype_tropism_simulated.svg" % (CELL_SET_NAME, VIRUS_NAME_1, VIRUS_NAME_2)))