In [None]:
import os

from aavomics import database
from aavomics import aavomics
import anndata
import pandas
import numpy
from scipy import stats
from statsmodels.stats import proportion
from statsmodels.stats.multitest import multipletests

import plotly.graph_objects as graph_objects
from plotly import offline as plotly

In [None]:
# CSV file name; it is joined onto database.DATA_PATH when the table is loaded.
TRANSDUCTION_RATE_FILE_NAME = "aavomics_cell_type_transduction_rates.csv"

# Smallest rounded count (transduced cells of a type, and transduced cells
# outside that type) required before a two-proportion z-test is run.
MIN_CELL_COUNT = 10

# Cell sets compared in the figure. Names containing "BC" are drawn in blue,
# all others in red.
CELL_SET_NAMES = [
    "20181127_TC1",
    "20190319_TC2",
    "20190111_BC1",
    "20190321_BC2",
]

# The plotted difference is (first virus - second virus).
VIRUS_NAMES = ["PHP.V1", "PHP.eB"]

# Two-level hierarchy: parent cell type -> subtype -> (empty) dict.
# Every subtype gets its own fresh empty dict.
CELL_TYPE_HIERARCHY = {
    cell_type: {cell_subtype: {} for cell_subtype in cell_subtypes}
    for cell_type, cell_subtypes in (
        ("Astrocytes", (
            "Myoc- Astrocytes",
            "Myoc+ Astrocytes",
        )),
        ("Vascular Cells", (
            "Endothelial Cells",
            "Pericytes",
            "Red Blood Cells",
            "Vascular SMCs",
            "VLMCs",
        )),
        ("Immune Cells", (
            "Perivascular Macrophages",
            "Microglia",
            "Leukocytes",
        )),
        ("Oligodendrocytes", (
            "OPCs",
            "Committed Oligodendrocytes",
            "Mature Oligodendrocytes",
        )),
    )
}

In [None]:
# Load the per-cell-type transduction table from the project data directory,
# using the first CSV column as the row index.
transduction_rate_df = pandas.read_csv(
    os.path.join(database.DATA_PATH, TRANSDUCTION_RATE_FILE_NAME),
    index_col=0,
)

In [None]:
# Cell type names recovered from the "<cell type> Transduction Rate" columns:
# the last two whitespace-separated words of each such column are dropped.
all_cell_types = {
    " ".join(column_name.split()[0:-2])
    for column_name in transduction_rate_df.columns
    if column_name.endswith("Transduction Rate")
}

In [None]:
# Estimated number of transduced cells, indexed as
# counts[virus name][cell set name][cell type name].
# Besides every subtype in CELL_TYPE_HIERARCHY, each innermost dict holds one
# entry per parent type (sum of its subtypes) and an "All Cells" entry (sum of
# every subtype in the hierarchy).
virus_cell_set_cell_type_transduction_counts = {}

for virus_name in VIRUS_NAMES:
    
    virus_cell_set_cell_type_transduction_counts[virus_name] = {}
    
    for cell_set_name in CELL_SET_NAMES:
        
        virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name] = {}
        virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name]["All Cells"] = 0
    
        cell_set_virus_mask = (transduction_rate_df["Cell Set"] == cell_set_name) & \
            (transduction_rate_df["Virus"] == virus_name)
    
        # Use the first matching row; .iloc[0] raises IndexError when the table
        # has no row for this (cell set, virus) pair.
        row = transduction_rate_df[cell_set_virus_mask].iloc[0]
        
        for parent_cell_type in CELL_TYPE_HIERARCHY:
            
            virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name][parent_cell_type] = 0
            
            for cell_type in CELL_TYPE_HIERARCHY[parent_cell_type]:
    
                num_cells = row["%s Num Cells" % cell_type]
                transduction_rate = row["%s Transduction Rate" % cell_type]

                # A missing count OR a missing rate is treated as zero transduced
                # cells. Guarding only the count would let a NaN rate turn the
                # parent and "All Cells" running sums into NaN.
                if numpy.isnan(num_cells) or numpy.isnan(transduction_rate):
                    num_transduced = 0
                else:
                    # Fractional estimate; it is rounded only where it is used.
                    num_transduced = num_cells * transduction_rate
                    
                virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name][parent_cell_type] += num_transduced
                virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name]["All Cells"] += num_transduced
                virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name][cell_type] = num_transduced

In [None]:
# Maps every cell type to its parent: top-level types point at "All Cells",
# subtypes point at the top-level type that contains them.
parent_cell_type_map = {}

for cell_type_name, cell_subtypes in CELL_TYPE_HIERARCHY.items():
    parent_cell_type_map[cell_type_name] = "All Cells"
    parent_cell_type_map.update(
        (cell_subtype_name, cell_type_name) for cell_subtype_name in cell_subtypes
    )

In [None]:
# Per-cell-type comparison of the two viruses in VIRUS_NAMES.
#
# For every parent cell type and each of its subtypes, and for every cell set:
#   * the share of all transduced cells that belong to the cell type is
#     computed for each virus, and the difference (first virus - second virus,
#     in percentage points) becomes one scatter point / box-plot sample;
#   * a two-proportion z-test compares those shares, unless one of the rounded
#     counts is below MIN_CELL_COUNT, in which case z = 0 is recorded.
# The per-cell-set z-scores are then combined with Stouffer's method.

# One entry per plotted cell type (parent types and subtypes, in plot order).
cell_type_names = []
z_scores = []
# One-sided Stouffer p-values; the two-sided values used for the significance
# call are derived from z_scores after this loop.
p_values = []

# One entry per plotted point; these four lists must stay index-aligned.
scatter_x_values = []
scatter_y_values = []
scatter_text = []
scatter_marker_colors = []

traces = []

cell_type_index = 0

z_scores_by_cell_set = {x: [] for x in CELL_SET_NAMES}

for cell_type_name in CELL_TYPE_HIERARCHY:
    
    # The parent type is plotted first, followed by each of its subtypes.
    for cell_subtype_name in [cell_type_name] + list(CELL_TYPE_HIERARCHY[cell_type_name].keys()):
        
        y_values = []
        cell_type_z_scores = []
        
        for cell_set_name in CELL_SET_NAMES:
            
            virus_counts = []
            parent_virus_counts = []
            
            for virus_name in VIRUS_NAMES:
                
                cell_type_counts = virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name]
                
                if cell_subtype_name not in cell_type_counts:
                    break
                
                # The denominator is the total over all cell types, so each
                # rate is the cell type's share of all transduced cells.
                parent_virus_count = cell_type_counts["All Cells"]
                
                if parent_virus_count == 0:
                    break
                    
                virus_counts.append(cell_type_counts[cell_subtype_name])
                parent_virus_counts.append(parent_virus_count)
            
            # Skip cell sets for which any virus lacks usable counts.
            if len(VIRUS_NAMES) != len(virus_counts):
                continue
            
            virus_counts = numpy.array(virus_counts)
            
            # Rates use the fractional estimates; only the test uses rounded counts.
            virus_rates = virus_counts/numpy.array(parent_virus_counts)
            
            # int64 rather than uint16: a 16-bit unsigned cast silently wraps
            # counts above 65535.
            virus_counts = numpy.round(virus_counts).astype(numpy.int64)
            parent_virus_counts = numpy.round(parent_virus_counts).astype(numpy.int64)
            
            if numpy.any(virus_counts < MIN_CELL_COUNT) or numpy.any((parent_virus_counts - virus_counts) < MIN_CELL_COUNT):
                # Too few cells in one of the groups: record a neutral result.
                z = 0
                p = 1
            else:
                z, p = proportion.proportions_ztest(virus_counts, parent_virus_counts)
            
            cell_type_z_scores.append(z)
            
            y_values.append((virus_rates[0] - virus_rates[1]) * 100)
            
            # Text and colour are appended only for cell sets that produce a
            # point, so both stay aligned with scatter_y_values even when a
            # cell set is skipped above.
            scatter_text.append(cell_set_name)
            scatter_marker_colors.append("blue" if "BC" in cell_set_name else "red")
        
            z_scores_by_cell_set[cell_set_name].append(z)
    
        cell_type_names.append(cell_subtype_name)
            
        if len(cell_type_z_scores) > 0:
            # combine_pvalues takes p-values, so the z-scores are converted to
            # upper-tail p-values first; the returned statistic is the
            # combined Stouffer z-score.
            cell_type_z_score, cell_type_p_value = stats.combine_pvalues(stats.norm.sf(cell_type_z_scores), method="stouffer")
        else:
            cell_type_z_score = 0
            cell_type_p_value = 1
        
        z_scores.append(cell_type_z_score)
        p_values.append(cell_type_p_value)
        
        x_values = [cell_type_index]*len(y_values)
        
        # Parent types get a black box outline, subtypes a grey one.
        if cell_subtype_name == cell_type_name:
            box_line_color = "black"
        else:
            box_line_color = "grey"
            
        box_trace = graph_objects.Box(
            x=x_values,
            y=y_values,
            line={
                "color": box_line_color
            },
            showlegend=False
        )
        
        scatter_y_values.extend(y_values)
        scatter_x_values.extend(x_values)
        
        traces.append(box_trace)
        
        cell_type_index += 1
        
# Two-sided p-values from the combined z-scores, followed by
# Benjamini-Hochberg FDR control at alpha = 0.05.
p_values = 2 * stats.norm.sf(numpy.abs(z_scores))
multitest_results = multipletests(p_values, alpha=0.05, method="fdr_bh")
# First element of the result: per-cell-type reject / do-not-reject flags.
are_significant = multitest_results[0]

# One marker per (cell type, cell set) point, overlaid on the box traces.
scatter_trace = graph_objects.Scatter(
    x=scatter_x_values,
    y=scatter_y_values,
    text=scatter_text,
    mode="markers",
    marker=dict(color=scatter_marker_colors),
    showlegend=False,
)

traces.append(scatter_trace)
    
# Tick labels of the form "<cell type> (<count for first virus>, <count for
# second virus>)", where each count is the rounded number of transduced cells
# summed over all cell sets.
cell_type_labels = []

for cell_type_name in cell_type_names:

    transduced_counts = [
        sum(
            virus_cell_set_cell_type_transduction_counts[virus_name][cell_set_name][cell_type_name]
            for cell_set_name in CELL_SET_NAMES
        )
        for virus_name in VIRUS_NAMES
    ]

    cell_type_labels.append(
        "%s (%i, %i)" % (
            cell_type_name,
            numpy.round(transduced_counts[0]),
            numpy.round(transduced_counts[1]),
        )
    )

# Figure layout: transparent backgrounds, one x tick per cell type, and a
# y axis showing the difference in percentage points.
layout = {
    "height": 600,
    "width": 800,
    "plot_bgcolor": "rgba(255, 255, 255, 0)",
    "paper_bgcolor": "rgba(255, 255, 255, 0)",
    "xaxis": {
        "tickvals": list(range(len(cell_type_names))),
        "ticktext": cell_type_labels,
    },
    "yaxis": {
        "zerolinecolor": "grey",
        "gridcolor": "rgba(0, 0, 0, 0.25)",
        "zerolinewidth": 2,
        "title": {
            "text": "Delta p (%)",
        },
    },
    "title": {
        "text": "%s Bias Over %s Appears in Both XFP- and Barcode-based Pooling" % (VIRUS_NAMES[0], VIRUS_NAMES[1]),
    },
}

figure = graph_objects.Figure(data=traces, layout=layout)

# Vertical extent of the plotted points; used to place the significance
# stars and to size the group separators.
lowest_y = min(scatter_y_values)
highest_y = max(scatter_y_values)

# Walk the cell types in the same order in which they were plotted so that
# cell_type_index matches both the x position and the are_significant entry.
cell_type_index = 0

for cell_type_name, cell_subtypes in CELL_TYPE_HIERARCHY.items():

    for cell_subtype_name in [cell_type_name, *cell_subtypes]:

        # Star above every cell type that passed the multiple-testing correction.
        if are_significant[cell_type_index]:
            figure.add_annotation(
                x=cell_type_index,
                y=highest_y,
                text="*",
                showarrow=False,
            )

        cell_type_index += 1

    # Dashed separator halfway between this group's last box and the next group.
    line_location = cell_type_index - 1/2

    figure.add_shape(
        type="line",
        x0=line_location,
        y0=lowest_y,
        x1=line_location,
        y1=highest_y,
        line={"width": 2, "dash": "dash"},
    )

plotly.iplot(figure)

In [None]:
# Export the figure as SVG into the relative "out" directory. The directory is
# created first so that the export also works from a fresh checkout in which
# "out" does not exist yet.
os.makedirs("out", exist_ok=True)
figure.write_image(os.path.join("out", "PHP-eB_vs_PHP-V1_delta_fraction_transduced.svg"))