In [None]:
from aavomics import database
import os
import pandas
import numpy
import anndata
import scanpy

from plotly import offline as plotly
from plotly import graph_objects
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly import graph_objects
from statsmodels.distributions.empirical_distribution import ECDF
import matplotlib.pyplot as plt
import dc_stat_think as dcst
from plotly.subplots import make_subplots
from statsmodels.stats.multitest import multipletests


In [None]:
def get_stats_for_gene(gene_list,cell_condition_df):
    """Look up the 'stat' value of each requested gene in one results table.

    Note that the values returned are taken from the 'stat' column, not from
    'log2FoldChange', even though callers store them in ``*_lfcs`` variables.

    Args:
        gene_list: Iterable of gene names to look up.
        cell_condition_df: DataFrame with 'Gene Name' and 'stat' columns.

    Returns:
        A list with one entry per gene in ``gene_list``, in the same order:
        the 'stat' of the first row whose 'Gene Name' matches, or 0 when the
        gene does not appear in the table.
    """
    gene_list = list(gene_list)
    if not gene_list:
        return []

    # Build the name -> stat table once instead of filtering the whole frame
    # for every gene. setdefault keeps the FIRST row for a duplicated name,
    # which is what the original `.values[0]` lookup returned.
    first_stat_by_gene = {}
    for gene_name, stat in zip(cell_condition_df['Gene Name'].values,
                               cell_condition_df['stat'].values):
        first_stat_by_gene.setdefault(gene_name, stat)

    return [first_stat_by_gene.get(gene, 0) for gene in gene_list]


def get_score_sorted_genes(gene_score_dict):
    """Order genes by ascending score.

    The previous implementation searched the dict for the first key matching
    each sorted value, so genes sharing a score with an earlier gene were
    silently dropped. Sorting the (gene, score) pairs keeps every gene; the
    sort is stable, so tied genes keep their insertion order.

    Args:
        gene_score_dict: Mapping of gene name -> numeric score.

    Returns:
        Tuple ``(sorted_gene_list, sorted_values)``: the gene names ordered
        from lowest to highest score, and the scores in that same order.
        Both lists are empty for an empty mapping.
    """
    ranked_pairs = sorted(gene_score_dict.items(), key=lambda pair: pair[1])

    sorted_gene_list = [gene for gene, _score in ranked_pairs]
    sorted_values = [score for _gene, score in ranked_pairs]
    return sorted_gene_list,sorted_values


In [None]:
# Input file names; both are resolved relative to aavomics' database.DATA_PATH.
TRANSDUCTION_RATE_FILE_NAME = "aavomics_cell_type_transduction_rates.csv"
ANNDATA_FILE_NAME = "aavomics_mouse_cortex_2021.h5ad"
# Transduction-rate table; the first CSV column is used as the index.
transduction_rate_df = pandas.read_csv(os.path.join(database.DATA_PATH, TRANSDUCTION_RATE_FILE_NAME), index_col=0)
# Annotated dataset. Later cells read obs["Cell Type"], obs["Cell Set"],
# var["Gene Name"] and the X matrix from it.
adata = anndata.read_h5ad(os.path.join(database.DATA_PATH, ANNDATA_FILE_NAME))

In [None]:
# Cell-type groups analysed in this notebook.
# Key:   group name; also used verbatim in the per-group CSV file names.
# Value: the adata.obs["Cell Type"] labels that are pooled into that group.
# The key order matters: later cells build per-group lists in this order.
CELL_TYPE_NAME_LABELS = {
    "Pericytes": ["Pericytes"],
    "Endothelial Cells": ["Endothelial Cells"],
    "VLMCs": ["VLMCs"],
    "Vascular SMCs": ["Vascular SMCs"],
    "Excitatory Neurons": ["L4/5", "L2", "L3", "L6", "L2/3", "L5", "L5/6"],
    "Inhibitory Neurons": ["Sncg", "Lamp5", "Pvalb", "Vip", "Sst", "Pax6"],
    "Myoc+ Astrocytes": ["Myoc+ Astrocytes"],
    "Myoc- Astrocytes": ["Myoc- Astrocytes"],
    "OPCs": ["OPCs"],
    "Committed Oligodendrocytes": ["Committed Oligodendrocytes"],
    "Mature Oligodendrocytes": ["Mature Oligodendrocytes"],
    "Microglia": ["Microglia"],
    "Perivascular Macrophages": ["Perivascular Macrophages"]
}

In [None]:
# Pseudo-bulk count export for the 3DPI vs. Control comparison.
# Writes, into the working directory:
#   * "<cell type>_gene_counts_3DPI.csv" for every group in CELL_TYPE_NAME_LABELS
#     (genes x samples, summed counts, no gene filtering),
#   * "All Cells_gene_counts_3DPI.csv" (all cells pooled, genes filtered below),
#   * "sample_metadata_3DPI.csv" (sample -> Condition / Batch).
# NOTE(review): these files are presumably the inputs of Figure_5B.R — confirm.
CONDITION_SAMPLE_DICT = {
    "3DPI": [
        "20200903_TC8","20200904_TC9","20210728_TC12"
    ],
    "Control": [
        "20200907_C3","20201119_C4","20210728_C5"
    ]
}

# Inverted mapping, sample -> condition. Its insertion order fixes the column
# order of every count table written by this cell.
sample_condition_dict = {}

for condition, samples in CONDITION_SAMPLE_DICT.items():

    for sample in samples:
        sample_condition_dict[sample] = condition

    print(condition)

# ---- one count table per cell-type group --------------------------------
for cell_type in CELL_TYPE_NAME_LABELS:

    print(cell_type)

    cell_type_mask = adata.obs["Cell Type"].isin(CELL_TYPE_NAME_LABELS[cell_type])
    cell_type_adata = adata[cell_type_mask].copy()

    sample_gene_counts = numpy.zeros((cell_type_adata.shape[1], len(sample_condition_dict)))

    samples = []

    for sample_index, sample in enumerate(sample_condition_dict):

        cell_type_sample_adata = cell_type_adata[cell_type_adata.obs["Cell Set"] == sample]

        # numpy.array(...).flatten() gives a 1-D per-gene vector whether
        # X.sum returns a matrix or a plain array.
        gene_sums = numpy.array(cell_type_sample_adata.X.sum(axis=0)).flatten()

        sample_gene_counts[:, sample_index] = gene_sums

        samples.append(sample)

    sample_gene_counts_df = pandas.DataFrame(sample_gene_counts, index=cell_type_adata.var.index, columns=samples)
    sample_gene_counts_df.index.name = "ensembl_id"

    sample_gene_counts_df.to_csv("%s_gene_counts_3DPI.csv" % cell_type, header=True)

# ---- all cells pooled ---------------------------------------------------
cell_type_adata = adata.copy()

# Keep a gene when, for at least one condition, every sample of that
# condition has a non-zero summed count for it.
# The builtin `bool` is used as dtype: the `numpy.bool` alias was deprecated
# in NumPy 1.20 and raises AttributeError on NumPy 1.24-1.26.
gene_mask = numpy.zeros(cell_type_adata.var.shape[0], dtype=bool)

for condition, samples in CONDITION_SAMPLE_DICT.items():

    sample_gene_mask = numpy.ones(cell_type_adata.var.shape[0], dtype=bool)

    for sample in samples:

        sample_total_counts = cell_type_adata[cell_type_adata.obs["Cell Set"].isin([sample])].X.sum(axis=0)
        sample_gene_mask = sample_gene_mask & (numpy.array(sample_total_counts) > 0).flatten()

    gene_mask = gene_mask | sample_gene_mask

cell_type_adata = cell_type_adata[:, gene_mask].copy()

sample_gene_counts = numpy.zeros((gene_mask.sum(), len(sample_condition_dict)))

samples = []

for sample_index, sample in enumerate(sample_condition_dict):

    cell_type_sample_adata = cell_type_adata[cell_type_adata.obs["Cell Set"] == sample]

    gene_sums = numpy.array(cell_type_sample_adata.X.sum(axis=0)).flatten()

    sample_gene_counts[:, sample_index] = gene_sums

    samples.append(sample)


sample_gene_counts_df = pandas.DataFrame(sample_gene_counts, index=cell_type_adata.var.index, columns=samples)
sample_gene_counts_df.index.name = "ensembl_id"

sample_gene_counts_df.to_csv("%s_gene_counts_3DPI.csv" % "All Cells", header=True)

# ---- sample metadata ----------------------------------------------------
# "Batch" is simply the sample name, i.e. every sample is its own batch.
metadata_df = pandas.DataFrame.from_dict(sample_condition_dict, orient="index", columns=["Condition"])
metadata_df["Batch"] = metadata_df.index.values
metadata_df.index.name = "Sample"
metadata_df.to_csv("sample_metadata_3DPI.csv", header=True)

In [None]:
# Pseudo-bulk count export for the 25DPI vs. Control comparison.
# Writes, into the working directory:
#   * "<cell type>_gene_counts_25DPI.csv" for every group in CELL_TYPE_NAME_LABELS
#     (genes x samples, summed counts, no gene filtering),
#   * "All Cells_gene_counts_25DPI.csv" (all cells pooled, genes filtered below),
#   * "sample_metadata_25DPI.csv" (sample -> Condition / Batch).
# NOTE(review): these files are presumably the inputs of Figure_5B.R — confirm.
CONDITION_SAMPLE_DICT = {
    "25DPI": [
        "20190713_TC7","20201120_TC10","20210726_TC11"
    ],
    "Control": [
        "20200907_C3","20201119_C4","20210728_C5",
    ]
}

# Inverted mapping, sample -> condition. Its insertion order fixes the column
# order of every count table written by this cell.
sample_condition_dict = {}

for condition, samples in CONDITION_SAMPLE_DICT.items():

    for sample in samples:
        sample_condition_dict[sample] = condition

    print(condition)

# ---- one count table per cell-type group --------------------------------
for cell_type in CELL_TYPE_NAME_LABELS:

    print(cell_type)

    cell_type_mask = adata.obs["Cell Type"].isin(CELL_TYPE_NAME_LABELS[cell_type])
    cell_type_adata = adata[cell_type_mask].copy()

    sample_gene_counts = numpy.zeros((cell_type_adata.shape[1], len(sample_condition_dict)))

    samples = []

    for sample_index, sample in enumerate(sample_condition_dict):

        cell_type_sample_adata = cell_type_adata[cell_type_adata.obs["Cell Set"] == sample]

        # numpy.array(...).flatten() gives a 1-D per-gene vector whether
        # X.sum returns a matrix or a plain array.
        gene_sums = numpy.array(cell_type_sample_adata.X.sum(axis=0)).flatten()

        sample_gene_counts[:, sample_index] = gene_sums

        samples.append(sample)

    sample_gene_counts_df = pandas.DataFrame(sample_gene_counts, index=cell_type_adata.var.index, columns=samples)
    sample_gene_counts_df.index.name = "ensembl_id"

    sample_gene_counts_df.to_csv("%s_gene_counts_25DPI.csv" % cell_type, header=True)

# ---- all cells pooled ---------------------------------------------------
cell_type_adata = adata.copy()

# Keep a gene when, for at least one condition, every sample of that
# condition has a non-zero summed count for it.
# The builtin `bool` is used as dtype: the `numpy.bool` alias was deprecated
# in NumPy 1.20 and raises AttributeError on NumPy 1.24-1.26.
gene_mask = numpy.zeros(cell_type_adata.var.shape[0], dtype=bool)

for condition, samples in CONDITION_SAMPLE_DICT.items():

    sample_gene_mask = numpy.ones(cell_type_adata.var.shape[0], dtype=bool)

    for sample in samples:

        sample_total_counts = cell_type_adata[cell_type_adata.obs["Cell Set"].isin([sample])].X.sum(axis=0)
        sample_gene_mask = sample_gene_mask & (numpy.array(sample_total_counts) > 0).flatten()

    gene_mask = gene_mask | sample_gene_mask

cell_type_adata = cell_type_adata[:, gene_mask].copy()

sample_gene_counts = numpy.zeros((gene_mask.sum(), len(sample_condition_dict)))

samples = []

for sample_index, sample in enumerate(sample_condition_dict):

    cell_type_sample_adata = cell_type_adata[cell_type_adata.obs["Cell Set"] == sample]

    gene_sums = numpy.array(cell_type_sample_adata.X.sum(axis=0)).flatten()

    sample_gene_counts[:, sample_index] = gene_sums

    samples.append(sample)


sample_gene_counts_df = pandas.DataFrame(sample_gene_counts, index=cell_type_adata.var.index, columns=samples)
sample_gene_counts_df.index.name = "ensembl_id"

sample_gene_counts_df.to_csv("%s_gene_counts_25DPI.csv" % "All Cells", header=True)

# ---- sample metadata ----------------------------------------------------
# "Batch" is simply the sample name, i.e. every sample is its own batch.
metadata_df = pandas.DataFrame.from_dict(sample_condition_dict, orient="index", columns=["Condition"])
metadata_df["Batch"] = metadata_df.index.values
metadata_df.index.name = "Sample"
metadata_df.to_csv("sample_metadata_25DPI.csv", header=True)

In [None]:
## Run Figure_5B.R using R 4.0.3 ##

In [None]:
# Number of significant (padj < 0.05) genes per cell-type group, one list per
# time point. Entries follow the key order of CELL_TYPE_NAME_LABELS.
num_de_genes_three_dpi = []
num_de_genes_twentyfive_dpi = []

for cell_type in CELL_TYPE_NAME_LABELS:
    for condition in ["3DPI", "25DPI"]:

        print("Condition: "+condition)
        print("Cell "+cell_type)

        file_name = "deseq2_out/%s_%s_deseq2.csv" % (cell_type, condition)

        # The table is indexed by its first column, which is used below to
        # look up gene names in adata.var.
        de_df = pandas.read_csv(file_name, index_col=0)
        de_df["Gene Name"] = adata.var.loc[de_df.index.values]["Gene Name"]

        # Rows with a missing padj compare False and are therefore not
        # counted; rows with log2FoldChange exactly 0 are counted in neither
        # direction.
        is_significant = de_df["padj"] < 0.05
        up = (is_significant & (de_df["log2FoldChange"] > 0)).sum()
        down = (is_significant & (de_df["log2FoldChange"] < 0)).sum()

        num_de_genes = up + down
        print(num_de_genes)

        if condition == "3DPI":
            num_de_genes_three_dpi.append(num_de_genes)
        elif condition == "25DPI":
            num_de_genes_twentyfive_dpi.append(num_de_genes)

        # The annotated table overwrites the input file; later cells read the
        # "Gene Name" column from it.
        de_df.to_csv(file_name)

In [None]:
# Figure 5B #
# Bar chart of the number of DE genes per cell-type group at 3DPI and 25DPI.

# X-axis categories: the cell-type group names, in dict order (the same order
# in which the two count lists were filled).
CELL_LABELS = list(CELL_TYPE_NAME_LABELS)

fig = go.Figure(data=[
    go.Bar(
        x=CELL_LABELS,
        y=num_de_genes_three_dpi,
        base=0,
        marker_color='crimson',
        name='3DPI',
    ),
    go.Bar(
        x=CELL_LABELS,
        y=num_de_genes_twentyfive_dpi,
        base=0,
        marker_color='lightslategrey',
        name='25DPI',
    ),
])

# Transparent plot and paper backgrounds, plus title and axis titles.
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    title='DE Genes Across Time',
    yaxis={'title': "number of genes"},
    xaxis={'title': "cell type"},
)

fig.show()

fig.write_image("Figure_5B.svg")

In [None]:
# Figure 5C #
# Load the DESeq2 tables of every cell type at both time points.

# Cell types in plotting order. Later cells locate a cell type's entry in the
# lists below with CELL_TYPES.index(...), so the lists are filled by iterating
# over this very list.
CELL_TYPES = [
'Pericytes',
'Endothelial Cells',
 'VLMCs',
'Vascular SMCs',
 'Excitatory Neurons',
 'Inhibitory Neurons',
'Myoc+ Astrocytes',
'Myoc- Astrocytes',
 'OPCs',
'Committed Oligodendrocytes',
 'Mature Oligodendrocytes',
'Microglia',
'Perivascular Macrophages']


# One dict per cell type: gene name -> 'stat', restricted to padj < 0.05 genes.
threedpi_de_df = []
twentyfivedpi_de_df = []

# Left empty by this cell.
threedpi_de_genes = {}
twentyfivedpi_de_genes = {}

# One full DESeq2 table per cell type.
threedpi_df_full = []
twentyfivedpi_df_full = []

for cell_type in CELL_TYPES:

    # Both time points are processed identically; only the destination
    # lists differ.
    for condition, de_stats_per_cell, full_tables_per_cell in [
        ("3DPI", threedpi_de_df, threedpi_df_full),
        ("25DPI", twentyfivedpi_de_df, twentyfivedpi_df_full),
    ]:
        file_name = "deseq2_out/%s_%s_deseq2.csv" % (cell_type, condition)
        de_df = pandas.read_csv(file_name, index_col=0)

        # Gene name -> 'stat', built in one pass instead of filtering the
        # whole table once per gene. setdefault keeps the first row of a
        # duplicated gene name, as the previous `.values[0]` lookup did.
        first_stat_by_gene = {}
        for gene_name, stat in zip(de_df['Gene Name'].values, de_df['stat'].values):
            first_stat_by_gene.setdefault(gene_name, stat)

        de_genes = de_df[de_df['padj']<0.05]['Gene Name'].values

        de_stats_per_cell.append({gene: first_stat_by_gene[gene] for gene in de_genes})
        full_tables_per_cell.append(de_df)
In [None]:
# Pool the p-values of every (cell type, time point) test and apply one
# Benjamini-Hochberg correction across all of them.
all_pvals = []
all_padjs = []
cell_types = []
genes = []
conditions = []
up_or_downs = []

for cell_type in CELL_TYPES:

    for condition in ["3DPI", "25DPI"]:
        # Same relative location as the other cells use; the path used to be
        # an absolute path into one user's home directory.
        df = pandas.read_csv("deseq2_out/%s_%s_deseq2.csv" % (cell_type, condition))

        # Only rows that actually have a p-value take part in the correction.
        tested_df = df[~df["pvalue"].isna()]
        pvals = tested_df["pvalue"].values

        all_pvals.extend(pvals)
        cell_types.extend([cell_type] * len(pvals))
        genes.extend(tested_df["Gene Name"])
        conditions.extend([condition] * len(pvals))
        up_or_downs.extend(["Up" if x > 0 else "Down" for x in tested_df["stat"].values])

# `trues` is the boolean reject mask over all pooled tests.
trues, padjusteds, thresh1, thresh2 = multipletests(all_pvals, method="fdr_bh")

# Columns: 0 = gene, 1 = cell type, 2 = condition, 3 = direction.
# The labels are given explicitly so the columns exist even when no test
# is significant.
significant_genes_df = pandas.DataFrame.from_records(
    list(zip(
        numpy.array(genes)[trues],
        numpy.array(cell_types)[trues],
        numpy.array(conditions)[trues],
        numpy.array(up_or_downs)[trues],
    )),
    columns=[0, 1, 2, 3],
)

# Genes kept after multiple test
threedpi_multipletest_genes = []
twentyfivedpi_multipletest_genes = []
for cell in CELL_TYPES:
    # A single combined mask replaces chained boolean indexing, which made
    # pandas reindex the second mask and emit a warning.
    is_current_cell = significant_genes_df[1] == cell

    current_threedpi_gene_list = list(
        significant_genes_df.loc[is_current_cell & (significant_genes_df[2] == "3DPI"), 0]
    )
    threedpi_multipletest_genes.append(current_threedpi_gene_list)

    current_twentyfivedpi_gene_list = list(
        significant_genes_df.loc[is_current_cell & (significant_genes_df[2] == "25DPI"), 0]
    )
    twentyfivedpi_multipletest_genes.append(current_twentyfivedpi_gene_list)

In [None]:
# One heatmap per cell type: DESeq2 'stat' of its DE genes at 3DPI and 25DPI.
# Rows are ordered 25DPI-only genes, then genes significant at both time
# points, then 3DPI-only genes.

# write_image below targets 'out/'; make sure the directory exists.
os.makedirs('out', exist_ok=True)

for cell_index, cell_of_interest in enumerate(CELL_TYPES):
    print(cell_of_interest)

    # Per-time-point DE genes of this cell type, ordered by ascending score.
    threedpi_cell_de_genes = get_score_sorted_genes(threedpi_de_df[cell_index])[0]
    twentyfivedpi_cell_de_genes = get_score_sorted_genes(twentyfivedpi_de_df[cell_index])[0]

    print(threedpi_cell_de_genes)
    common_de_gene_set = set(threedpi_cell_de_genes) & set(twentyfivedpi_cell_de_genes)
    common_de_genes = list(common_de_gene_set)

    # Sets make the membership tests below constant-time.
    current_cell_mt_genes_threedpi = set(threedpi_multipletest_genes[cell_index])
    current_cell_mt_genes_twentyfivedpi = set(twentyfivedpi_multipletest_genes[cell_index])

    # Time-point-specific genes: not shared, and kept by the pooled
    # multiple-testing correction.
    # NOTE(review): the shared genes are NOT filtered by that correction —
    # confirm this is intended.
    threedpi_cell_de_genes = [
        item for item in threedpi_cell_de_genes
        if item not in common_de_gene_set and item in current_cell_mt_genes_threedpi
    ]
    twentyfivedpi_cell_de_genes = [
        item for item in twentyfivedpi_cell_de_genes
        if item not in common_de_gene_set and item in current_cell_mt_genes_twentyfivedpi
    ]

    # Shared genes are ranked by the mean of their two 'stat' values.
    common_threedpi_stats = get_stats_for_gene(common_de_genes, threedpi_df_full[cell_index])
    common_twentyfivedpi_stats = get_stats_for_gene(common_de_genes, twentyfivedpi_df_full[cell_index])

    common_de_genes_dict = {}
    for gene, current_threedpi_stat, current_twentyfivedpi_stat in zip(
            common_de_genes, common_threedpi_stats, common_twentyfivedpi_stats):
        common_de_genes_dict[gene] = numpy.mean([current_threedpi_stat, current_twentyfivedpi_stat])

    common_de_genes_sorted, stats = get_score_sorted_genes(common_de_genes_dict)

    print(twentyfivedpi_cell_de_genes)

    # Build a new list rather than extending twentyfivedpi_cell_de_genes in
    # place, and leave the mNeonGreen entry out of the heatmap rows.
    ordered_gene_list = [
        gene
        for gene in twentyfivedpi_cell_de_genes + common_de_genes_sorted + threedpi_cell_de_genes
        if gene != 'mNeonGreen'
    ]

    # The values plotted are the 'stat' column entries (0 for a gene missing
    # from a table), despite the *_lfcs names.
    threedpi_lfcs = get_stats_for_gene(ordered_gene_list, threedpi_df_full[cell_index])
    twentyfivedpi_lfcs = get_stats_for_gene(ordered_gene_list, twentyfivedpi_df_full[cell_index])

    x_axis_labels = ['3DPI','25DPI']

    # Rows = genes, columns = (3DPI, 25DPI).
    combined_score_matrix = numpy.column_stack((threedpi_lfcs, twentyfivedpi_lfcs))

    # Diverging colour scale centred on 0 with fixed limits of -20 and 20.
    figr = go.Figure(data=go.Heatmap(z=combined_score_matrix,x=x_axis_labels,y=ordered_gene_list,hoverongaps=False,zmid=0,colorscale = 'RdBu',zmin=-20,zmax=20))
    figr.update_layout(
        autosize=False,
        width=250,
        height=1000
    )
    figr.show()

    save_title=cell_of_interest+"_time_heatmap"

    figr.write_image('out/'+save_title+'.svg')
