In [None]:
import os

from aavomics import database
from aavomics import aavomics
import anndata
import pandas
import numpy
from scipy import stats
from statsmodels.stats.multitest import multipletests

import plotly.graph_objects as graph_objects
from plotly import offline as plotly

In [None]:
# Viruses analyzed below; each name is matched against the "Virus" column of the
# transduction-rate table (exactly for per-sample rows, as a substring for "BC" rows).
VIRUS_NAMES = ["PHP.eB", "PHP.V1"]

# CSV file, located under database.DATA_PATH, holding the per-cell-type
# "<cell type> Num Cells" and "<cell type> Transduction Rate" columns read below.
TRANSDUCTION_RATE_FILE_NAME = "aavomics_cell_type_transduction_rates.csv"

In [None]:
# Cell sets (samples) included in the analysis; matched against the "Cell Set"
# column of the transduction-rate table.
CELL_SET_NAMES = ["20181127_TC1", "20190319_TC2", "20190111_BC1", "20190321_BC2"]

# Parent cell type -> subtypes. Transduced-cell counts of each subtype are summed
# into its parent; cell types not listed under any parent are left out of the totals.
# The empty dicts are placeholders: only membership of a subtype name is checked.
CELL_TYPE_HIERARCHY = {
    "Astrocytes": {
        "Myoc- Astrocytes": {},
        "Myoc+ Astrocytes": {}
    },
    "Vascular Cells": {
        "Endothelial Cells": {},
        "Pericytes": {},
        "Red Blood Cells": {},
        "Vascular SMCs": {},
        "VLMCs": {}
    },
    "Immune Cells": {
        "Microglia": {},
        "Perivascular Macrophages": {},
        "Leukocytes": {}
    },
    "Oligodendrocytes": {
        "OPCs": {},
        "Committed Oligodendrocytes": {},
        "Mature Oligodendrocytes": {}
    }
}


In [None]:
# Load the transduction-rate table; the first CSV column becomes the index.
transduction_rate_df = pandas.read_csv(
    os.path.join(database.DATA_PATH, TRANSDUCTION_RATE_FILE_NAME),
    index_col=0
)

# Every "<cell type> Transduction Rate" column names one cell type: drop the two
# trailing words ("Transduction", "Rate") and keep whatever precedes them.
all_cell_types = {
    " ".join(column_name.split()[0:-2])
    for column_name in transduction_rate_df.columns
    if column_name.endswith("Transduction Rate")
}

In [None]:
# For every (virus, cell set) sample, compute what percentage of the transduced
# cells falls into each parent cell type of CELL_TYPE_HIERARCHY.
# Entries are (virus, cell set, parent cell type, percent) tuples.
transduction_data = []

for virus_name in VIRUS_NAMES:
    for cell_set_name in CELL_SET_NAMES:

        # Row for this exact virus name in this cell set; the first match is used.
        is_sample_row = (transduction_rate_df["Cell Set"] == cell_set_name) & \
            (transduction_rate_df["Virus"] == virus_name)
        row = transduction_rate_df[is_sample_row].iloc[0]

        cell_type_num_transduced = dict.fromkeys(CELL_TYPE_HIERARCHY, 0)
        total_num_transduced = 0

        for cell_type in all_cell_types:

            num_cells = row["%s Num Cells" % cell_type]
            transduction_rate = row["%s Transduction Rate" % cell_type]

            # A missing cell count means there is nothing to add for this cell type
            if numpy.isnan(num_cells):
                continue

            num_transduced = num_cells * transduction_rate

            # Parents listing this cell type as a subtype (none -> not counted at all)
            parent_cell_types = [
                parent_name
                for parent_name, subtypes in CELL_TYPE_HIERARCHY.items()
                if cell_type in subtypes
            ]

            for parent_name in parent_cell_types:
                cell_type_num_transduced[parent_name] += num_transduced

            if parent_cell_types:
                total_num_transduced += num_transduced

        # Express each parent's transduced count as a percentage of the sample total
        transduction_data.extend(
            (
                virus_name,
                cell_set_name,
                cell_type_name,
                cell_type_num_transduced[cell_type_name]/total_num_transduced*100
            )
            for cell_type_name in CELL_TYPE_HIERARCHY
        )

In [None]:
# Display the (virus, cell set, parent cell type, percent of transduced cells) tuples.
transduction_data

In [None]:
# Same per-parent-cell-type percentages as computed for whole samples, but one set
# of entries per matching "BC" row: rows whose "Virus" label contains both the virus
# name and "BC" (presumably the individually barcoded variants - confirm against the CSV).
# Entries are (virus, cell set, parent cell type, percent) tuples.
barcode_transduction_data = []

for virus_name in VIRUS_NAMES:

    for cell_set_name in CELL_SET_NAMES:

        virus_labels = transduction_rate_df["Virus"]

        # regex=False: virus names such as "PHP.eB" contain ".", which would otherwise
        # be interpreted as a regex wildcard. na=False: a missing label never matches.
        cell_set_virus_mask = (transduction_rate_df["Cell Set"] == cell_set_name) & \
            virus_labels.str.contains(virus_name, regex=False, na=False) & \
            virus_labels.str.contains("BC", regex=False, na=False)

        for _, row in transduction_rate_df[cell_set_virus_mask].iterrows():

            cell_type_num_transduced = {cell_type: 0 for cell_type in CELL_TYPE_HIERARCHY}

            total_num_transduced = 0

            for cell_type in all_cell_types:

                num_cells = row["%s Num Cells" % cell_type]
                transduction_rate = row["%s Transduction Rate" % cell_type]

                # A missing cell count means there is nothing to add for this cell type
                if numpy.isnan(num_cells):
                    continue

                num_transduced = num_cells * transduction_rate

                # Only cell types listed under a parent contribute to the totals
                counts = False

                for parent_cell_type, cell_subtypes in CELL_TYPE_HIERARCHY.items():
                    if cell_type in cell_subtypes:
                        cell_type_num_transduced[parent_cell_type] += num_transduced
                        counts = True

                if counts:
                    total_num_transduced += num_transduced

            # Express each parent's transduced count as a percentage of this row's total
            for cell_type_name in CELL_TYPE_HIERARCHY:

                barcode_transduction_data.append((virus_name, cell_set_name, cell_type_name, cell_type_num_transduced[cell_type_name]/total_num_transduced*100))

In [None]:
# Display the per-"BC"-row (virus, cell set, parent cell type, percent) tuples.
barcode_transduction_data

In [None]:
# Compare the spread of cell-type percentages across samples (inter-sample) with the
# spread across barcodes within one sample (intra-sample), per virus and cell type,
# using Bartlett's test, then combine the per-cell-type p-values with Stouffer's method.
cell_types = set([x[2] for x in transduction_data])
cell_types_list = ["Astrocytes", "Vascular Cells", "Oligodendrocytes", "Immune Cells"]
p_values = []

# Standard deviations collected for the box plot: one inter-sample value per tested
# (virus, cell type), one intra-sample value per tested barcode group.
all_cell_type_values = []
all_cell_type_cell_set_values = []
# Stouffer weights: mean percentage of each tested (virus, cell type)
cell_type_weights = []

for virus_name in VIRUS_NAMES:

    for cell_type in cell_types_list:

        # Per cell set, the percentages of its individual barcodes; a group needs
        # at least two barcodes to have any spread.
        cell_type_cell_set_values = []

        for cell_set_name in CELL_SET_NAMES:

            values_these_barcodes = [
                entry[3]
                for entry in barcode_transduction_data
                if entry[0] == virus_name
                and entry[1] == cell_set_name
                and entry[2] == cell_type
            ]

            if len(values_these_barcodes) > 1:
                cell_type_cell_set_values.append(values_these_barcodes)

        # Per-sample percentages for this virus and cell type
        cell_type_values = [
            entry[3]
            for entry in transduction_data
            if entry[0] == virus_name and entry[2] == cell_type
        ]

        this_cell_type_p_values = []

        inter_sample_std = numpy.std(cell_type_values)

        # Skip degenerate inputs: no spread, or NaN/inf (e.g. missing or NaN percentages)
        if not numpy.isfinite(inter_sample_std) or inter_sample_std == 0:
            continue

        for sample_barcode_values in cell_type_cell_set_values:

            intra_sample_std = numpy.std(sample_barcode_values)

            # Check the barcode group before running the test so Bartlett is never
            # evaluated on a zero-spread or non-finite group.
            if not numpy.isfinite(intra_sample_std) or intra_sample_std == 0:
                continue

            z, p = stats.bartlett(cell_type_values, sample_barcode_values)

            # A NaN or infinite result would poison the correction and combination below
            if not numpy.isfinite(z) or not numpy.isfinite(p):
                continue

            # Convert to 1-tailed test - we are testing the hypothesis that inter-sample variance is greater than intra
            if inter_sample_std > intra_sample_std:
                p = p/2
            else:
                p = 1-p/2

            this_cell_type_p_values.append(p)
            all_cell_type_cell_set_values.append(intra_sample_std)

        if len(this_cell_type_p_values) == 0:
            continue

        # Appended together with p_values below so weights and p-values stay aligned
        cell_type_weights.append(numpy.mean(cell_type_values))

        # Since we are comparing every set of barcodes to the same set of animal values,
        # these are not independent tests. So, we correct and take the mean to be conservative

        _, corrected_values, _, _ = multipletests(this_cell_type_p_values, method="fdr_bh")
        p_value = numpy.mean(corrected_values)

        print(virus_name, cell_type)
        print(p_value)

        p_values.append(p_value)

        all_cell_type_values.append(inter_sample_std)

# Last expression of the cell: the combined (statistic, p-value) is displayed as output
stats.combine_pvalues(p_values, method="stouffer", weights=cell_type_weights)

In [None]:
# Box plots of the standard deviations collected by the statistics cell:
# across samples (inter-sample) versus across barcodes within a sample (intra-sample).
trace1 = graph_objects.Box(
    y=all_cell_type_values,
    boxpoints="all",
    name="Inter-Sample Variance"
)

trace2 = graph_objects.Box(
    y=all_cell_type_cell_set_values,
    boxpoints="all",
    name="Intra-Sample Variance"
)

# Transparent backgrounds with faint grid lines
layout = {
    "height": 600,
    "width": 800,
    "plot_bgcolor": "rgba(255, 255, 255, 0)",
    "paper_bgcolor": "rgba(255, 255, 255, 0)",
    "yaxis": {
        "title": {
            "text": "Standard Deviation",
        },
        "gridcolor": "rgba(0, 0, 0, 0.25)",
        "zerolinecolor": "rgba(0, 0, 0, 0.25)"
    },
    "title": {
        "text":"Sample vs Barcode Variance"
    },
    "showlegend": False
}

figure = graph_objects.Figure(data=[trace1, trace2], layout=layout)

plotly.iplot(figure)

# Make sure the output directory exists so the export also works on a fresh checkout
os.makedirs("out", exist_ok=True)

figure.write_image(os.path.join("out", "sample_vs_barcode_variance.svg"))