# Pseudobulk differential expression analysis (DEA) of spleen samples

In [None]:
import numpy as np
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.legend import Legend
import matplotlib.colors as colors
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import matplotlib.patches as patches
import pandas as pd
import scipy
import scanpy as sc
import anndata as ad

import random

from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

eps = 1e-300

In [None]:
# Notebook-wide scanpy/matplotlib figure defaults (resolution, font size, output format).
sc.set_figure_params(
    scanpy=True,
    dpi=300,
    dpi_save=1200,
    frameon=True,
    vector_friendly=False,
    fontsize=14,
    figsize=(9, 8),
    format='pdf',
    facecolor=None,
    transparent=False,
    ipython_format='png2x',
)

In [None]:
# Load the annotated spleen dataset; the path is relative to the notebook's working directory.
adata = sc.read_h5ad(filename="maranou_032024_spleen_annotated.h5ad")

In [None]:
# Cluster id -> cell-type label.
# Based on DEA and marker genes from PanglaoDB, Tabula Muris and Cell Marker 2.0
# (currently dysfunctional), Annotation of Cell Types: ACT.
annotation_dict = {
    "0": "Naive B cells",
    "1": "Naive B cells",
    "2": "MZB and B-1 cells",
    "3": "Naive B cells",
    "4": "Activated B cells",
    "5": "Immature B cells",
    "6": "NK cells",
    "7": "CD8+",
    "8": "Treg",
    "9": "Naive B cells",
    "10": "CD4+",
    "11": "Monocytes and macrophages",  # Classical monocytes here
    "12": "MZB and B-1 cells",
    # Cluster "13" ("DC2") is replaced by the three dc2r* sub-clusters below.
    "dc2r0": "WDFY4+ cDC2",
    "dc2r1": "Relb(low) cDC2",
    "dc2r2": "Migratory cDC2",

    "14": "Tcm",
    "15": "Mature follicular B cells",
    "17": "Th",
    "18": "Monocytes and macrophages",  # Nonclassical monocytes
    "19": "Proliferative B cells",  # Activated follicular B cells?
    "20": "Lymphoid-resident cDC1",
    "21": "Heterogenous T cells",
    "22": "Neutrophils",  # Activated neutrophils or myeloid-derived suppressor cells (MDSCs)
    "24": "Mast cells",
    # Cluster "25" ("CD8- DC1") is replaced by the three dc1r* sub-clusters below.

    # The two DC1 clusters are more clearly separated by CyC than Cd8a (which has varied expression in CyC(hi))
    # Ref: The protease inhibitor cystatin C is differentially expressed among dendritic cell populations,
    # but does not control antigen presentation.
    # El-Sukkari et al. J Immunol. 2003 Nov 15;171(10):5003-11. doi: 10.4049/jimmunol.171.10.5003.
    "dc1r0": "CD8- CCR2+ cDC1",
    "dc1r1": "CD8(low) cDC1",
    "dc1r2": "CD8- CCR2- cDC1",

    "26": "pDC",
    "27": "Activated T cells",  # With B cell characteristics Cd79a, Pax5, and Ighd
    "28": "Plasma cells",
    "r0": "Mixed DC",
    "r1": "Red pulp macrophages",
    "r2": "Relb(int.) cDC2",
    "r3": "Germinal center B cells",
    "r4": "Plasma cells",
    "r5": "Treg",
}

# Cell-type label -> RGB colour used by every UMAP in this notebook.
ann_colors = plt.colormaps['tab20'].colors
_to_rgb = plt.matplotlib.colors.to_rgb

ann_palette_all = {
    "Germinal center B cells": ann_colors[18],
    "Naive B cells": ann_colors[0],
    "CD8+": ann_colors[4],
    "Activated B cells": ann_colors[13],
    "Immature B cells": _to_rgb('dodgerblue'),
    "Activated T cells": ann_colors[2],
    "Tcm": _to_rgb('lightseagreen'),
    "CD4+": _to_rgb('greenyellow'),
    "Treg": ann_colors[16],
    "Th": _to_rgb('lime'),
    "Red pulp macrophages": ann_colors[3],
    "Heterogenous T cells": ann_colors[15],
    "NK cells": ann_colors[17],
    "Th cells": _to_rgb('mediumseagreen'),
    "MZB and B-1 cells": ann_colors[1],
    "Mature follicular B cells": _to_rgb('navy'),
    "Mixed DC": ann_colors[5],
    "CCR7+ DC1": ann_colors[12],
    "pDC": ann_colors[6],
    "Plasma cells": ann_colors[19],
    "Monocytes and macrophages": ann_colors[7],  # Probably Classical monocytes
    "Lymphoid-resident cDC1": ann_colors[8],  # XCR1+ [Gurka et al]
    "CD8- CCR2+ cDC1": _to_rgb('b'),
    "CD8(low) cDC1": ann_colors[0],
    "CD8- CCR2- cDC1": _to_rgb('navy'),
    # This key used to be listed twice (ann_colors[13], then 'darkorchid'); a dict
    # literal keeps only the last value, so the effective colour is kept here and
    # the dead entry is removed.
    # NOTE(review): 'darkorchid' is also the colour of "WDFY4+ cDC2", so the two
    # populations are indistinguishable on the DC UMAP - confirm the intended colour.
    "Relb(int.) cDC2": _to_rgb('darkorchid'),
    "WDFY4+ cDC2": _to_rgb('darkorchid'),
    "Relb(low) cDC2": _to_rgb('firebrick'),
    "Migratory cDC2": _to_rgb('cornflowerblue'),
    "Proliferative": _to_rgb('b'),
    "Mast cells": _to_rgb('cornflowerblue'),
    "Neutrophils": _to_rgb('coral'),
    "Proliferative B cells": _to_rgb('cyan'),
}

In [None]:
# Distinct high-resolution cell-type labels present in the data.
cell_types = list(adata.obs['cell_type_high_res'].unique())

# Plot options shared by the overview UMAP and the per-sample panels.
umap_style = dict(
    color=['cell_type_high_res'],
    add_outline=True,
    outline_width=(0.2, 0.8),
    palette=ann_palette_all,
    alpha=0.7,
    s=10,
    legend_loc='on data',
)

# Overview: all samples together.
sc.pl.umap(adata, title='Spleen', legend_fontsize=8, legend_fontweight='heavy', **umap_style)

# 2 x 2 grid with one panel per sample.
ncols = 2
nrows = 2
figsize = 8
wspace = 0.1
fig, axs = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(ncols * figsize + figsize * wspace * (ncols - 1), nrows * figsize),
)
plt.subplots_adjust(wspace=wspace)

# (sample label in adata.obs['sample'], panel title, target Axes)
sample_panels = [
    ('wt_naive', 'Spleen node, Naive WT', axs[0, 0]),
    ('wt_pathogenic', 'Spleen node, Pathogenic WT', axs[0, 1]),
    ('ko_naive', 'Spleen node, Naive Cd74 KO', axs[1, 0]),
    ('ko_pathogenic', 'Spleen node, Pathogenic Cd74 KO', axs[1, 1]),
]
for sample_name, panel_title, panel_ax in sample_panels:
    sc.pl.umap(
        adata[adata.obs['sample'] == sample_name],
        ax=panel_ax,
        show=False,
        title=[panel_title],
        legend_fontsize=12,
        legend_fontweight='medium',
        **umap_style,
    )

plt.tight_layout()
# plt.savefig('umap_spleen_annotations_samples.pdf',dpi=600)
plt.show()

## Dendritic cells (DC)

In [None]:
# Annotation labels that count as dendritic cells (cDC1, cDC2, pDC and mixed DC).
dc_types = [
    "Lymphoid-resident cDC1",
    "CD8(low) cDC1",
    "CD8- CCR2+ cDC1",
    "CD8- CCR2- cDC1",
    "CCR7+ DC1",
    "Relb(int.) cDC2",
    "Migratory cDC2",
    "WDFY4+ cDC2",
    "Relb(low) cDC2",
    "pDC",
    "Mixed DC",
]


In [None]:
# Keep only the DC cells and drop label categories that no longer occur in the subset.
is_dc = adata.obs['cell_type_high_res'].isin(dc_types)
adata_dc = adata[is_dc].copy()
dc_labels = pd.Categorical(adata_dc.obs['cell_type_high_res'])
adata_dc.obs['cell_type_high_res'] = dc_labels.remove_unused_categories()


In [None]:
# Report the extent of the DC cells in UMAP space (used to choose the axis limits of the DC plot).
umap_xy = adata_dc.obsm['X_umap']
for axis_label, axis_idx in (("X", 0), ("Y", 1)):
    print(axis_label + " range:", umap_xy[:, axis_idx].min(), "to", umap_xy[:, axis_idx].max())

In [None]:
# DC-only UMAP, drawn on an explicit Axes so the view window can be fixed after plotting.
fig, axes = plt.subplots(1, 1)

dc_umap_kwargs = {
    'color': ['cell_type_high_res'],
    'add_outline': True,
    'outline_width': (0.2, 0.5),
    'palette': ann_palette_all,
    'alpha': 0.7,
    's': 10,
    'title': 'Spleen DCs',
    'legend_loc': 'on data',
    'legend_fontsize': 12,
    'legend_fontweight': 'medium',
    'show': False,
    'ax': axes,
}
sc.pl.umap(adata_dc, **dc_umap_kwargs)

# Hand-tuned view window; adjust together with the ranges printed for adata_dc.obsm['X_umap'].
axes.set_xlim(left=-3.5, right=8.)
axes.set_ylim(bottom=-5.5, top=5.0)

plt.savefig('umap_spleen_dc_annotations.pdf', bbox_inches='tight')
plt.show()

### 1. Pseudobulk DESeq2 contrasts on aggregated DCs (KO vs WT; pathogenic vs naive)

In [None]:
# DESeq2 is fitted on a column called 'condition'; it is a copy of the per-cell sample label.
adata.obs['condition'] = adata.obs['sample'].copy()

In [None]:
# Pseudobulk DESeq2 analysis of all dendritic-cell (DC) subtypes pooled together.
#
# For every sample the DC cells are shuffled and split into `num_pseudo_samples`
# pseudo-replicates, the raw counts of each pseudo-replicate are summed, and a
# DESeq2 model is fitted on the sample label ('condition'). Four contrasts are
# extracted: KO vs WT (pathogenic), KO vs WT (naive), pathogenic vs naive (WT)
# and pathogenic vs naive (KO).
#
# NOTE(review): the pseudo-replicates are random partitions of the cells of a
# single sample, i.e. not independent replicates - interpret p-values with care.

dc_types = ["Lymphoid-resident cDC1", "CD8(low) cDC1", "CD8- CCR2+ cDC1", "CD8- CCR2- cDC1", "CCR7+ DC1",
            "Relb(int.) cDC2", "Migratory cDC2", "WDFY4+ cDC2", "Relb(low) cDC2", "pDC", "Mixed DC"]
adata_subset = adata[adata.obs['cell_type_high_res'].isin(dc_types)].copy()

num_pseudo_samples = 4
PSEUDOBULK_SEED = 0  # fixed seed: the random cell split is reproducible on Restart & Run All
counts_dir = '/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/counts/'


def _contrast_results(dds, tested, reference):
    """Run the Wald test for `tested` vs `reference` on the 'condition' factor.

    Returns the DeseqStats object and a copy of its results table restricted to
    genes with baseMean > 0.25, extended with 'log_pvalue' and 'log_padj'
    (negative natural log of the p-value / adjusted p-value; `eps` avoids log(0)).
    """
    stats = DeseqStats(dds, contrast=['condition', tested, reference])
    stats.summary()  # prints the full results table
    # .copy() so that the new columns are written to an independent frame, not a slice.
    results = stats.results_df[stats.results_df.baseMean > 0.25].copy()
    results['log_pvalue'] = -np.log(eps + results['pvalue'])
    results['log_padj'] = -np.log(eps + results['padj'])
    return stats, results


def _running_mean(previous, current, n_repeat):
    """Fold the results of repeat `n_repeat` into the running mean over repeats.

    Only genes present in both tables are updated; genes missing from `current`
    keep their previous running value.
    """
    if previous is None or n_repeat == 1:
        return current
    shared = previous.index.intersection(current.index)
    previous.loc[shared] = previous.loc[shared] * (n_repeat - 1) / n_repeat + current.loc[shared] / n_repeat
    return previous


de_kowt_pat = de_kowt_naive = de_patnaive_wt = de_patnaive_ko = None

for n_repeat in np.arange(1, 2):

    # One independent, reproducible shuffle per repeat.
    rng = random.Random(PSEUDOBULK_SEED + int(n_repeat))

    pseudodatas = []

    for sample in adata_subset.obs['sample'].unique():

        # Copy the subset so that .X can be replaced without writing into a view.
        adata_sample = adata_subset[adata_subset.obs['sample'] == sample].copy()
        adata_sample.X = adata_sample.layers['counts']

        indices = list(adata_sample.obs_names)
        rng.shuffle(indices)
        indices = np.array_split(np.array(indices), num_pseudo_samples)
        print('Sample, total cells, cells in first pseudo-replicate:', sample, adata_sample.n_obs, len(indices[0]))

        for i, pseudo_rep in enumerate(indices):

            rep_cells = adata_sample[pseudo_rep]
            # Sum the raw counts over cells; force a (1, n_genes) dense array for sparse and dense inputs alike.
            rep_counts = np.asarray(rep_cells.X.sum(axis=0)).reshape(1, -1)

            rep_adata = sc.AnnData(X=rep_counts, var=rep_cells.var[[]])
            rep_adata.obs_names = [str(sample) + '_' + str(i)]
            rep_adata.obs['condition'] = adata_sample.obs['condition'].iloc[0]
            rep_adata.obs['wt/ko'] = adata_sample.obs['WT/KO'].iloc[0]
            rep_adata.obs['pathogenicity'] = adata_sample.obs['pathogenicity'].iloc[0]
            rep_adata.obs['replicate'] = i

            pseudodatas.append(rep_adata)

    pseudodata = sc.concat(pseudodatas)

    counts = pd.DataFrame(np.asarray(pseudodata.X), columns=pseudodata.var_names, index=pseudodata.obs.index)

    # DeSeq2 uses underscore internally to separate conditions. Therefore we need to remove them from the condition names.
    pseudodata.obs['condition'] = pseudodata.obs['condition'].astype(str).str.replace('_', '', regex=False)

    # Save counts and metadata for plotting heatmaps
    counts.T.to_csv(counts_dir + 'pseudobulk_counts_spleen_aggregatedDC_' + str(n_repeat) + '.csv')
    metadata = pseudodata.obs
    metadata.to_csv(counts_dir + 'pseudobulk_metadata_spleen_aggregatedDC_' + str(n_repeat) + '.csv')

    # `design` formula: same keyword as the genotype model fitted later in this notebook.
    dds = DeseqDataSet(counts=counts.astype(int),
                       metadata=pseudodata.obs,
                       design='~condition')

    # Filter out genes with zero counts
    sc.pp.filter_genes(dds, min_cells=1)

    dds.deseq2()

    ## Statistics for pathogenic KO vs WT
    stat_res_pat, res_pat = _contrast_results(dds, 'kopathogenic', 'wtpathogenic')
    de_kowt_pat = _running_mean(de_kowt_pat, res_pat, n_repeat)

    ## Statistics for naive KO vs WT
    stat_res_naive, res_naive = _contrast_results(dds, 'konaive', 'wtnaive')
    de_kowt_naive = _running_mean(de_kowt_naive, res_naive, n_repeat)

    ## Statistics for WT pathogenic vs naive
    stat_res_wt, res_wt = _contrast_results(dds, 'wtpathogenic', 'wtnaive')
    de_patnaive_wt = _running_mean(de_patnaive_wt, res_wt, n_repeat)

    ## Statistics for KO pathogenic vs naive
    stat_res_ko, res_ko = _contrast_results(dds, 'kopathogenic', 'konaive')
    de_patnaive_ko = _running_mean(de_patnaive_ko, res_ko, n_repeat)

    

# Volcano plot for KO vs WT in pathogenic samples (aggregated DCs), followed by
# the export of the significant genes (DEGs) of all four contrasts to CSV.
results_dir = '/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/'

plt.figure(figsize=(6, 4))
plt.scatter(de_kowt_pat['log2FoldChange'], de_kowt_pat['log_padj'], facecolor='dodgerblue', edgecolor='blue',
            linewidths=0.5, zorder=10, alpha=0.7, s=12)

# Significance line at adjusted p = 0.05 (the y axis is -ln(padj)).
plt.axhline(-np.log(0.05), linestyle='--', linewidth=0.8, alpha=0.7)

# DEG definition: adjusted p < 0.05 and |log2 fold change| > 0.5.
padj_threshold = -np.log(eps + 0.05)
lfc_threshold = 0.5
degs_kowt_pat = de_kowt_pat[(de_kowt_pat['log_padj'] > padj_threshold) & (np.abs(de_kowt_pat['log2FoldChange']) > lfc_threshold)]
degs_kowt_naive = de_kowt_naive[(de_kowt_naive['log_padj'] > padj_threshold) & (np.abs(de_kowt_naive['log2FoldChange']) > lfc_threshold)]
degs_patnaive_wt = de_patnaive_wt[(de_patnaive_wt['log_padj'] > padj_threshold) & (np.abs(de_patnaive_wt['log2FoldChange']) > lfc_threshold)]
degs_patnaive_ko = de_patnaive_ko[(de_patnaive_ko['log_padj'] > padj_threshold) & (np.abs(de_patnaive_ko['log2FoldChange']) > lfc_threshold)]

sig_logfoldchanges_kowt_pat = degs_kowt_pat['log2FoldChange']
sig_log_padj_kowt_pat = degs_kowt_pat['log_padj']
sig_var_names_kowt_pat = list(degs_kowt_pat.index)

# Label every DEG except the knocked-out gene. The Series are indexed by gene
# name, so the values are taken as positional arrays instead of using integer
# `[]` look-ups.
for gene_name, lfc, log_padj in zip(sig_var_names_kowt_pat,
                                    sig_logfoldchanges_kowt_pat.to_numpy(),
                                    sig_log_padj_kowt_pat.to_numpy()):
    if gene_name != 'Cd74':
        plt.text(lfc + 0.1, log_padj + 0.05, str(gene_name), fontsize=8, zorder=15)

if len(sig_var_names_kowt_pat) > 0:
    # Axis limits ignore Cd74; errors='ignore' covers the case where Cd74 was filtered out of the table.
    lfc_without_ko = de_kowt_pat.drop(['Cd74'], errors='ignore')['log2FoldChange']
    plt.xlim([np.nanmin(lfc_without_ko) - 0.5, np.nanmax(lfc_without_ko) + 1.0])

    padj_without_ko = sig_log_padj_kowt_pat[np.asarray(sig_var_names_kowt_pat) != 'Cd74']
    if len(padj_without_ko) > 0:  # nothing to scale to when Cd74 is the only DEG
        plt.ylim([-0.05 * np.nanmax(padj_without_ko), np.nanmax(padj_without_ko) * 1.05 + 0.5])

plt.title('KO vs. WT in pathogenic spleen samples, aggregated DCs', fontsize='small')
plt.xlabel('log2 fold change')
plt.ylabel('-ln(adjusted p-value)')

# NOTE(review): 'pahogenic' in the file name looks like a typo; it is left unchanged
# because the correctly spelled name is written by a later volcano cell.
plt.savefig(results_dir + 'deseq2_pathogenic_results/volcano_ko_vs_wt_spleen_pahogenic_aggregatedDC.pdf', dpi=600, bbox_inches="tight")

plt.show()

# Rank DEGs by |log2 fold change| + -ln(padj), largest first.
rank_score = (np.abs(sig_logfoldchanges_kowt_pat) + sig_log_padj_kowt_pat).to_numpy()
top_genes_pat = np.asarray(sig_var_names_kowt_pat)[np.argsort(rank_score)[::-1]]

print('top_genes:', top_genes_pat)

degs_kowt_pat.to_csv(results_dir + 'deseq2_pathogenic_results/Spleen_pat_kowt_degs_aggregatedDC.csv', index=True)
degs_kowt_naive.to_csv(results_dir + 'deseq2_naive_results/Spleen_naive_kowt_degs_aggregatedDC.csv', index=True)
degs_patnaive_wt.to_csv(results_dir + 'deseq2_wt_results/Spleen_wt_patnaive_degs_aggregatedDC.csv', index=True)
degs_patnaive_ko.to_csv(results_dir + 'deseq2_ko_results/Spleen_ko_patnaive_degs_aggregatedDC.csv', index=True)


## Volcano plots and heatmaps

In [None]:
# Reload the exported DEG tables. The gene names were written as the unnamed first
# CSV column; it becomes the index and is also kept as a regular column.
de_kowt_naive = pd.read_csv(
    '/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_naive_results/Spleen_naive_kowt_degs_aggregatedDC.csv'
).set_index("Unnamed: 0", drop=False)
de_kowt_pat = pd.read_csv(
    '/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_pathogenic_results/Spleen_pat_kowt_degs_aggregatedDC.csv'
).set_index("Unnamed: 0", drop=False)
de_patnaive_wt = pd.read_csv(
    '/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_wt_results/Spleen_wt_patnaive_degs_aggregatedDC.csv'
).set_index("Unnamed: 0", drop=False)
de_patnaive_ko = pd.read_csv(
    '/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_ko_results/Spleen_ko_patnaive_degs_aggregatedDC.csv'
).set_index("Unnamed: 0", drop=False)


In [None]:
# Volcano plot: KO vs WT in pathogenic samples (aggregated DCs).
# The y axis is ln(-ln(padj)); `de_kowt_pat` is the table reloaded from CSV.
sns.set_style("white")
plt.figure(figsize=(9, 6))
plt.scatter(de_kowt_pat['log2FoldChange'], np.log(de_kowt_pat['log_padj']), facecolor='dodgerblue', edgecolor='blue',
            linewidths=0.5, zorder=10, alpha=0.7, s=12)

# Significance line at adjusted p = 0.05 on the ln(-ln p) scale.
plt.axhline(np.log(-np.log(0.05)), linestyle='--', linewidth=0.8, alpha=0.7)

degs_kowt_pat = de_kowt_pat[(de_kowt_pat['log_padj'] > -np.log(eps + 0.05)) & (np.abs(de_kowt_pat['log2FoldChange']) > 0.5)]
degs_kowt_pat_up = degs_kowt_pat[degs_kowt_pat['log2FoldChange'] > 0.5]
degs_kowt_pat_down = degs_kowt_pat[degs_kowt_pat['log2FoldChange'] < -0.5]

sig_logfoldchanges_kowt_pat = degs_kowt_pat['log2FoldChange']
sig_log_padj_kowt_pat = degs_kowt_pat['log_padj']
sig_var_names_kowt_pat = list(degs_kowt_pat.index)

# Positions of the DEGs ordered by |LFC| + ln(-ln padj), largest first.
top_ind = np.argsort((np.abs(sig_logfoldchanges_kowt_pat) + np.log(sig_log_padj_kowt_pat)).to_numpy())[::-1]
top_names = np.asarray(sig_var_names_kowt_pat)[top_ind]

# Label the 200 top-ranked DEGs (except the knocked-out gene). The Series are
# indexed by gene name, so positions must go through .iloc.
labelled_genes = set()
for i, gene_name in enumerate(top_names[0:200]):

    if gene_name != 'Cd74':
        plt.text(sig_logfoldchanges_kowt_pat.iloc[top_ind[i]] + 0.05,
                 np.log(sig_log_padj_kowt_pat.iloc[top_ind[i]]) + 0.05,
                 str(gene_name), fontsize=10, zorder=15)
        labelled_genes.add(str(gene_name))

# Genes that are always labelled when they are DEGs, even outside the top 200.
# Genes absent from the DEG table are skipped instead of raising an IndexError.
select_names = ['Jun', 'Junb']
select_ind = [sig_var_names_kowt_pat.index(name) for name in select_names if name in sig_var_names_kowt_pat]

for pos in select_ind:
    gene_name = sig_var_names_kowt_pat[pos]
    if gene_name in labelled_genes:
        continue  # already labelled by the loop over the top-ranked genes
    plt.text(sig_logfoldchanges_kowt_pat.iloc[pos] + 0.05,
             np.log(sig_log_padj_kowt_pat.iloc[pos]) + 0.05,
             str(gene_name), fontsize=10, zorder=15)

# Only Cd74 will be outside this window
plt.xlim(-5.5, 7.5)
plt.ylim(1, 5.75)
plt.title('KO vs. WT in pathogenic spleen samples, aggregated DC', fontsize=14)

plt.ylabel('log(-log p)', fontsize=13)
plt.xlabel('LFC', fontsize=13)

plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_pathogenic_results/volcano_ko_vs_wt_spleen_pathogenic_aggregatedDC.pdf', dpi=600, bbox_inches="tight")

plt.show()

# Rankings by |LFC| + -ln(padj), largest first: all DEGs, then up- and down-regulated separately.
top_genes_pat = np.asarray(sig_var_names_kowt_pat)[
    np.argsort((np.abs(sig_logfoldchanges_kowt_pat) + sig_log_padj_kowt_pat).to_numpy())[::-1]]

sig_var_names_kowt_pat_up = list(degs_kowt_pat_up.index)
sig_var_names_kowt_pat_down = list(degs_kowt_pat_down.index)

sig_logfoldchanges_kowt_pat_up = degs_kowt_pat_up['log2FoldChange']
sig_logfoldchanges_kowt_pat_down = degs_kowt_pat_down['log2FoldChange']

sig_log_padj_kowt_pat_up = degs_kowt_pat_up['log_padj']
sig_log_padj_kowt_pat_down = degs_kowt_pat_down['log_padj']

top_genes_pat_up = np.asarray(sig_var_names_kowt_pat_up)[
    np.argsort((np.abs(sig_logfoldchanges_kowt_pat_up) + sig_log_padj_kowt_pat_up).to_numpy())[::-1]]
top_genes_pat_down = np.asarray(sig_var_names_kowt_pat_down)[
    np.argsort((np.abs(sig_logfoldchanges_kowt_pat_down) + sig_log_padj_kowt_pat_down).to_numpy())[::-1]]

print('top genes up:', top_genes_pat_up)
print('top genes down:', top_genes_pat_down)

In [None]:
# Volcano plot: KO vs WT in naive samples (aggregated DCs).
# The y axis is ln(-ln(padj)); `de_kowt_naive` is the table reloaded from CSV.
sns.set_style("white")
plt.figure(figsize=(6, 4))

plt.scatter(de_kowt_naive['log2FoldChange'], np.log(de_kowt_naive['log_padj']), facecolor='dodgerblue', edgecolor='blue',
            linewidths=0.5, zorder=10, alpha=0.7, s=12)

# Significance line at adjusted p = 0.05 on the ln(-ln p) scale.
plt.axhline(np.log(-np.log(0.05)), linestyle='--', linewidth=0.8, alpha=0.7)

degs_kowt_naive = de_kowt_naive[(de_kowt_naive['log_padj'] > -np.log(eps + 0.05)) & (np.abs(de_kowt_naive['log2FoldChange']) > 0.5)]
degs_kowt_naive_up = degs_kowt_naive[degs_kowt_naive['log2FoldChange'] > 0.5]
degs_kowt_naive_down = degs_kowt_naive[degs_kowt_naive['log2FoldChange'] < -0.5]

sig_logfoldchanges_kowt_naive = degs_kowt_naive['log2FoldChange']
# Unlike the other volcano cells, this Series already holds ln(-ln padj).
sig_log_padj_kowt_naive = np.log(degs_kowt_naive['log_padj'])
sig_var_names_kowt_naive = list(degs_kowt_naive.index)

# Label every DEG except the knocked-out gene. The Series are indexed by gene
# name, so positions must go through .iloc.
for i, gene_name in enumerate(sig_var_names_kowt_naive):

    if gene_name != 'Cd74':
        plt.text(sig_logfoldchanges_kowt_naive.iloc[i] + 0.1, sig_log_padj_kowt_naive.iloc[i] + 0.05,
                 str(gene_name), fontsize=8, zorder=15)

# The x range is derived from the data without Cd74 and set once, after the label
# loop; errors='ignore' covers the case where Cd74 is not in the table.
if len(sig_var_names_kowt_naive) > 0:
    lfc_without_ko = de_kowt_naive.drop(['Cd74'], errors='ignore')['log2FoldChange']
    plt.xlim([np.nanmin(lfc_without_ko) - 0.5, np.nanmax(lfc_without_ko) + 1.0])

# Fixed y window, applied before saving so that the saved PDF matches the figure
# shown in the notebook.
plt.ylim(1, 4.25)
# plt.xlim(-2.5,3.)

plt.title('KO vs. WT in healthy mouse spleen', fontsize='small')

# Saved next to the naive DEG table (deseq2_naive_results).
plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_naive_results/volcano_ko_vs_wt_spleen_naive_aggregatedDC.pdf', dpi=600, bbox_inches="tight")

plt.show()

# Rankings by |LFC| + -ln(padj), largest first, for up- and down-regulated DEGs.
sig_var_names_kowt_naive_up = list(degs_kowt_naive_up.index)
sig_var_names_kowt_naive_down = list(degs_kowt_naive_down.index)

sig_logfoldchanges_kowt_naive_up = degs_kowt_naive_up['log2FoldChange']
sig_logfoldchanges_kowt_naive_down = degs_kowt_naive_down['log2FoldChange']

sig_log_padj_kowt_naive_up = degs_kowt_naive_up['log_padj']
sig_log_padj_kowt_naive_down = degs_kowt_naive_down['log_padj']

top_genes_naive_up = np.asarray(sig_var_names_kowt_naive_up)[
    np.argsort((np.abs(sig_logfoldchanges_kowt_naive_up) + sig_log_padj_kowt_naive_up).to_numpy())[::-1]]
top_genes_naive_down = np.asarray(sig_var_names_kowt_naive_down)[
    np.argsort((np.abs(sig_logfoldchanges_kowt_naive_down) + sig_log_padj_kowt_naive_down).to_numpy())[::-1]]

print('top genes up:', top_genes_naive_up)
print('top genes down:', top_genes_naive_down)

# Ranking over all DEGs; note that it uses the ln(-ln padj) Series defined above.
top_genes_naive = np.asarray(sig_var_names_kowt_naive)[
    np.argsort((np.abs(sig_logfoldchanges_kowt_naive) + sig_log_padj_kowt_naive).to_numpy())[::-1]]


In [None]:
# Volcano plot: pathogenic vs naive in WT samples (aggregated DCs).
# The y axis is ln(-ln(padj)); `de_patnaive_wt` is the table reloaded from CSV.
sns.set_style("white")
plt.figure(figsize=(6, 4))

plt.scatter(de_patnaive_wt['log2FoldChange'], np.log(de_patnaive_wt['log_padj']), facecolor='dodgerblue', edgecolor='blue',
            linewidths=0.5, zorder=10, alpha=0.7, s=12)

# Significance line at adjusted p = 0.05 on the ln(-ln p) scale.
plt.axhline(np.log(-np.log(0.05)), linestyle='--', linewidth=0.8, alpha=0.7)

degs_patnaive_wt = de_patnaive_wt[(de_patnaive_wt['log_padj'] > -np.log(eps + 0.05)) & (np.abs(de_patnaive_wt['log2FoldChange']) > 0.5)]

degs_patnaive_wt_up = de_patnaive_wt[(de_patnaive_wt['log_padj'] > -np.log(eps + 0.05)) & (de_patnaive_wt['log2FoldChange'] > 0.5)]
degs_patnaive_wt_down = de_patnaive_wt[(de_patnaive_wt['log_padj'] > -np.log(eps + 0.05)) & (de_patnaive_wt['log2FoldChange'] < -0.5)]

sig_logfoldchanges_patnaive_wt = degs_patnaive_wt['log2FoldChange']
sig_log_padj_patnaive_wt = degs_patnaive_wt['log_padj']
sig_var_names_patnaive_wt = list(degs_patnaive_wt.index)

# Label every DEG. The Series are indexed by gene name, so positions must go through .iloc.
for i, gene_name in enumerate(sig_var_names_patnaive_wt):

    plt.text(sig_logfoldchanges_patnaive_wt.iloc[i] + 0.1, np.log(sig_log_padj_patnaive_wt.iloc[i]) + 0.05,
             str(gene_name), fontsize=8, zorder=15)

plt.title('Pathogenic vs. Naive, WT mouse spleen', fontsize='small')

# Fixed y window, applied before saving so that the saved PDF matches the figure
# shown in the notebook.
plt.ylim(1, 4.5)
# plt.xlim(-2.5,3.)

plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_wt_results/volcano_pat_vs_naive_spleen_wt_aggregatedDC.pdf', dpi=600, bbox_inches="tight")

plt.show()

# Rankings by |LFC| + -ln(padj), largest first, for up- and down-regulated DEGs.
sig_var_names_patnaive_wt_up = list(degs_patnaive_wt_up.index)
sig_var_names_patnaive_wt_down = list(degs_patnaive_wt_down.index)

sig_logfoldchanges_patnaive_wt_up = degs_patnaive_wt_up['log2FoldChange']
sig_logfoldchanges_patnaive_wt_down = degs_patnaive_wt_down['log2FoldChange']

sig_log_padj_patnaive_wt_up = degs_patnaive_wt_up['log_padj']
sig_log_padj_patnaive_wt_down = degs_patnaive_wt_down['log_padj']

top_genes_wt_up = np.asarray(sig_var_names_patnaive_wt_up)[
    np.argsort((np.abs(sig_logfoldchanges_patnaive_wt_up) + sig_log_padj_patnaive_wt_up).to_numpy())[::-1]]
top_genes_wt_down = np.asarray(sig_var_names_patnaive_wt_down)[
    np.argsort((np.abs(sig_logfoldchanges_patnaive_wt_down) + sig_log_padj_patnaive_wt_down).to_numpy())[::-1]]

print('top genes up:', top_genes_wt_up)
print('top genes down:', top_genes_wt_down)

In [None]:
# Volcano plot: pathogenic vs naive in Cd74 KO samples (aggregated DCs).
# The y axis is ln(-ln(padj)); `de_patnaive_ko` is the table reloaded from CSV.
sns.set_style("white")
plt.figure(figsize=(6, 4))

plt.scatter(de_patnaive_ko['log2FoldChange'], np.log(de_patnaive_ko['log_padj']), facecolor='dodgerblue', edgecolor='blue',
            linewidths=0.5, zorder=10, alpha=0.7, s=12)

# Significance line at adjusted p = 0.05 on the ln(-ln p) scale.
plt.axhline(np.log(-np.log(0.05)), linestyle='--', linewidth=0.8, alpha=0.7)

degs_patnaive_ko = de_patnaive_ko[(de_patnaive_ko['log_padj'] > -np.log(eps + 0.05)) & (np.abs(de_patnaive_ko['log2FoldChange']) > 0.5)]

degs_patnaive_ko_up = de_patnaive_ko[(de_patnaive_ko['log_padj'] > -np.log(eps + 0.05)) & (de_patnaive_ko['log2FoldChange'] > 0.5)]
degs_patnaive_ko_down = de_patnaive_ko[(de_patnaive_ko['log_padj'] > -np.log(eps + 0.05)) & (de_patnaive_ko['log2FoldChange'] < -0.5)]

sig_logfoldchanges_patnaive_ko = degs_patnaive_ko['log2FoldChange']
sig_log_padj_patnaive_ko = degs_patnaive_ko['log_padj']
sig_var_names_patnaive_ko = list(degs_patnaive_ko.index)

# Label every DEG. The Series are indexed by gene name, so positions must go through .iloc.
for i, gene_name in enumerate(sig_var_names_patnaive_ko):

    plt.text(sig_logfoldchanges_patnaive_ko.iloc[i] + 0.1, np.log(sig_log_padj_patnaive_ko.iloc[i]) + 0.05,
             str(gene_name), fontsize=8, zorder=15)

# This cell plots the KO contrast (the table and the output file are the *_ko ones).
plt.title('Pathogenic vs. Naive, Cd74 KO mouse spleen', fontsize='small')

# Fixed y window, applied before saving so that the saved PDF matches the figure
# shown in the notebook.
plt.ylim(1, 4.5)
# plt.xlim(-2.5,3.)

plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_ko_results/volcano_pat_vs_naive_spleen_ko_aggregatedDC.pdf', dpi=600, bbox_inches="tight")

plt.show()

# Rankings by |LFC| + -ln(padj), largest first, for up- and down-regulated DEGs.
sig_var_names_patnaive_ko_up = list(degs_patnaive_ko_up.index)
sig_var_names_patnaive_ko_down = list(degs_patnaive_ko_down.index)

sig_logfoldchanges_patnaive_ko_up = degs_patnaive_ko_up['log2FoldChange']
sig_logfoldchanges_patnaive_ko_down = degs_patnaive_ko_down['log2FoldChange']

sig_log_padj_patnaive_ko_up = degs_patnaive_ko_up['log_padj']
sig_log_padj_patnaive_ko_down = degs_patnaive_ko_down['log_padj']

top_genes_ko_up = np.asarray(sig_var_names_patnaive_ko_up)[
    np.argsort((np.abs(sig_logfoldchanges_patnaive_ko_up) + sig_log_padj_patnaive_ko_up).to_numpy())[::-1]]
top_genes_ko_down = np.asarray(sig_var_names_patnaive_ko_down)[
    np.argsort((np.abs(sig_logfoldchanges_patnaive_ko_down) + sig_log_padj_patnaive_ko_down).to_numpy())[::-1]]

print('top genes up:', top_genes_ko_up)
print('top genes down:', top_genes_ko_down)

In [None]:
# DEGs shared by the two KO-vs-WT contrasts (wt_n -> ko_n and wt_pat -> ko_pat).
# Such genes respond to the Cd74 KO in the same direction in naive and pathogenic conditions.
shared_up = list(set(top_genes_naive_up) & set(top_genes_pat_up))
shared_down = list(set(top_genes_naive_down) & set(top_genes_pat_down))

for direction, shared in (('Shared up:', shared_up), ('Shared down:', shared_down)):
    print(direction, len(shared))
    print(shared)


In [None]:
# DEGs shared by wt_n -> wt_p and wt_n -> ko_n.
# Such genes change in the same direction under pathogenicity and under the Cd74 KO,
# i.e. the KO 'simulates' pathogenicity for them.
shared_up = list(set(top_genes_naive_up) & set(top_genes_wt_up))
shared_down = list(set(top_genes_naive_down) & set(top_genes_wt_down))

for direction, shared in (('Shared up:', shared_up), ('Shared down:', shared_down)):
    print(direction, len(shared))
    print(shared)


In [None]:
# DEGs shared by wt_n -> wt_p and wt_pat -> ko_pat.
shared_up = list(set(top_genes_wt_up) & set(top_genes_pat_up))
# The printed list intersects with the pathogenic KO-vs-WT set, the same pair of
# sets that the printed count is based on.
shared_down = list(set(top_genes_wt_down) & set(top_genes_pat_down))

print('Shared up:', len(shared_up))
print(shared_up)
print('Shared down:', len(shared_down))
print(shared_down)


### Heatmap of top DE genes in pathogenic samples

In [None]:
# Reload the pseudobulk table saved by the DESeq2 cell (repeat 1): genes in rows, pseudo-replicates in columns.
counts = (
    pd.read_csv('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/counts/pseudobulk_counts_spleen_aggregatedDC_1.csv')
    .rename(columns={"Unnamed: 0": "Gene"})
    .set_index('Gene')
)

# Matching per-replicate metadata; 'sample' is used as index and kept as a column,
# and the 'wt/ko' column is renamed to 'genotype'.
metadata = (
    pd.read_csv('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/counts/pseudobulk_metadata_spleen_aggregatedDC_1.csv')
    .rename(columns={"Unnamed: 0": "sample", "wt/ko": "genotype"})
    .set_index('sample', drop=False)
)


In [None]:
# Refit DESeq2 on the reloaded pseudobulk table with genotype as the only design term.
sample_by_gene_counts = counts.astype(int).T  # DeseqDataSet receives samples in rows
dds = DeseqDataSet(
    counts=sample_by_gene_counts,
    metadata=metadata,
    design='~genotype',
)

# Keep only genes detected in at least one pseudobulk sample.
sc.pp.filter_genes(dds, min_cells=1)

dds.deseq2()

In [None]:
# Heatmap of the top KO-vs-WT DEGs in the pathogenic samples.
# Apply a stricter p-value threshold for plotting: log_padj must exceed -log(0.01)
# and |log2FoldChange| must exceed 1.
degs = de_kowt_pat[(de_kowt_pat['log_padj'] > -np.log(eps+0.01))&(np.abs(de_kowt_pat['log2FoldChange'])>1.0)]

sig_logfoldchanges = degs['log2FoldChange']
sig_log_padj = degs['log_padj']
sig_var_names = list(degs.index)

dds.layers['log1p'] = np.log1p(dds.layers['normed_counts'])
# NOTE(review): sig_var_names_kowt_pat is defined in an earlier cell, not here.
# It must contain every gene in sig_var_names, otherwise the column selection
# with top_genes_pat below fails — confirm, or subset with sig_var_names instead.
dds_sig_pat = dds[:, sig_var_names_kowt_pat]

log1p_counts = pd.DataFrame(dds_sig_pat.layers['log1p'], index=dds_sig_pat.obs.index, columns=dds_sig_pat.var_names)

# Rank the plotted genes by |LFC| + log_padj, strongest first
top_genes_pat=np.asarray(sig_var_names)[np.argsort(np.abs(sig_logfoldchanges)+sig_log_padj)[::-1]]

# Per-gene median of the log1p-normalised counts within each genotype
wt_pat_median = np.median(log1p_counts.loc[['wt_pathogenic_0','wt_pathogenic_1','wt_pathogenic_2','wt_pathogenic_3']],axis=0)
ko_pat_median = np.median(log1p_counts.loc[['ko_pathogenic_0','ko_pathogenic_1','ko_pathogenic_2','ko_pathogenic_3']],axis=0)

median_log1p_counts_pathogenic = pd.DataFrame([wt_pat_median,ko_pat_median], index=['WT', 'KO'], columns=dds_sig_pat.var_names)

sns.set_style("white")

# No plt.figure() here: sns.clustermap creates its own figure, so an explicit
# call would only leave an empty extra figure in the output.
# All genes in top_genes_pat are plotted; no gene (e.g. Cd74) is excluded here.
g=sns.clustermap(median_log1p_counts_pathogenic[top_genes_pat].T,
               cmap='RdYlBu_r',
               yticklabels=True,
               xticklabels=True,
               col_cluster=False,
               row_cluster=True,
               cbar_pos=None,
               dendrogram_ratio=(.5, 0.),
               # cbar_pos=(0.02, .32, .03, .2)
               figsize = (4,18)
              )

g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_ymajorticklabels(), fontsize = 9)
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xmajorticklabels(), fontsize = 12)


# Rotate labels for landscape format
# g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_ymajorticklabels(), fontsize = 9, rotation=180)
# g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xmajorticklabels(), fontsize = 12, rotation=90)


diff_ko_wt = median_log1p_counts_pathogenic.loc['KO'] - median_log1p_counts_pathogenic.loc['WT']

# '+' where the KO median is above the WT median, '-' otherwise
signs_ko_wt = {gene: '+' if d > 0 else '-' for gene, d in diff_ko_wt[top_genes_pat].items()}

# Row order after clustering, as positions into top_genes_pat
reordered_genes = g.dendrogram_row.reordered_ind

ax = g.ax_heatmap

# Add the signs on top of the heatmap, on the boundary between the WT and KO columns
for idx, gene_idx in enumerate(reordered_genes):

    gene = top_genes_pat[gene_idx]
    sign_ko_wt = signs_ko_wt[gene]

    # White backing patch so the sign stays readable on any cell colour
    rect = patches.Rectangle((0.9, idx), 0.2, 1, fill=True, facecolor='white', edgecolor='none', alpha=0.9)
    ax.add_patch(rect)
    ax.text(1, idx+0.5, sign_ko_wt, ha='center', va='center', fontweight='bold', fontsize=10)

# plt.title('Heatmap of Top Differentially Expressed Genes in Pathogenic Samples')
plt.tight_layout()

plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_pathogenic_results/heatmap_ko_vs_wt_in_pathogenic_spleen_aggregatedDC.pdf', dpi=600,bbox_inches = "tight")

plt.show()

## Interaction analysis: How do WT and KO respond to the introduction of a pathogenic condition?

In [None]:
#Import interaction DEA results generated in R

def read_results(filename):
    """Read a DEA results CSV and return it as a DataFrame indexed by its 'gene' column."""
    return pd.read_csv(filename).set_index('gene')


In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
# Combine the interaction-DEA results of the repeated runs into a single table.
# Per gene, the raw p-values are averaged on the -log scale and converted back,
# the other statistics are averaged arithmetically, and the combined p-values
# are then adjusted with Benjamini-Hochberg.
eps = 1e-300  # small constant to avoid log(0)
stat_columns = ['log_pvalue', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat']

# gene -> {statistic -> list of values, one entry per repeat in which the gene passed the filter}
all_results = {}
# Every gene seen in at least one repeat
all_genes = set()

# Average stats over repeats
for n_repeat in np.arange(1, 6):
    # Read results for this repeat
    results_df = read_results('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/spleen_aggregatedDC_deseq2_interaction_results_'+str(n_repeat)+'.csv')

    # Filter by base mean. The explicit copy makes `res` an independent frame, so
    # adding a column below does not write into a slice of results_df
    # (which triggers SettingWithCopyWarning).
    res = results_df[results_df.baseMean > 0.25].copy()

    # Store log of raw p-values
    res['log_pvalue'] = -np.log(eps + res['pvalue'])

    if n_repeat == 1:
        # Snapshot of the first repeat; de_res is reassigned to the combined
        # table in the plotting cell further down.
        de_res = res.copy()

    all_genes.update(res.index)

    # Collect this repeat's statistics per gene. itertuples walks the rows once
    # instead of doing a scalar .loc lookup for every gene and column.
    for gene, *values in res[stat_columns].itertuples(index=True, name=None):
        gene_stats = all_results.get(gene)
        if gene_stats is None:
            gene_stats = all_results[gene] = {column: [] for column in stat_columns}
        for column, value in zip(stat_columns, values):
            gene_stats[column].append(value)

# Create final results dataframe. Iterating the dict (first-seen order) instead of
# the set of gene names keeps the row order identical between sessions.
final_results = []
for gene, gene_stats in all_results.items():
    # Mean of the -log p-values, converted back to a p-value
    mean_log_pvalue = np.mean(gene_stats['log_pvalue'])
    final_results.append({
        'gene': gene,
        'pvalue': np.exp(-mean_log_pvalue),
        'baseMean': np.mean(gene_stats['baseMean']),
        'log2FoldChange': np.mean(gene_stats['log2FoldChange']),
        'lfcSE': np.mean(gene_stats['lfcSE']),
        'stat': np.mean(gene_stats['stat'])
    })

# Convert to dataframe
final_df = pd.DataFrame(final_results).set_index('gene')

# Remove NaN values before running multipletests
valid_mask = final_df['pvalue'].notna().to_numpy()
valid_pvals = final_df.loc[valid_mask, 'pvalue']

# Run multipletests only on valid p-values; genes without a p-value keep padj = NaN
adj_pvals = np.full(len(final_df), np.nan)
adj_pvals[valid_mask] = multipletests(valid_pvals.values, method='fdr_bh')[1]
final_df['padj'] = adj_pvals

# Store log-transformed versions
final_df['log_pvalue'] = -np.log(eps + final_df['pvalue'])
final_df['log_padj'] = -np.log(eps + final_df['padj'])

In [None]:
# Display the combined per-gene interaction results (statistics averaged over the repeats, plus padj and the log-transformed p-values)
final_df

In [None]:
# Volcano plot of the interaction DEA (difference in the response to pathogenicity, KO vs WT)
de_res = final_df

sns.set_style("white")
plt.figure(figsize=(9,6))

# y axis is log(-log padj); de_res['log_padj'] already holds -log(padj)
plt.scatter(de_res['log2FoldChange'], np.log(de_res['log_padj']) , facecolor='dodgerblue', edgecolor='blue',linewidths=0.5, zorder=10,alpha=0.7, s=14)

# Significance line at padj = 0.05 on the same log(-log) scale
plt.axhline( np.log(-np.log(0.05)) , linestyle='--', linewidth=0.8, alpha=0.7)

degs = de_res[(de_res['log_padj'] > -np.log(eps+0.05))&(np.abs(de_res['log2FoldChange'])>0.5)]

sig_logfoldchanges = degs['log2FoldChange']
sig_log_padj = degs['log_padj']
sig_var_names = list(degs.index)

# Plain arrays for positional access. The Series above are labelled by gene name,
# and indexing them with integers relies on a positional fallback that pandas
# has deprecated.
lfc_values = sig_logfoldchanges.to_numpy()
log_log_padj_values = np.log(sig_log_padj.to_numpy())
sig_names_arr = np.asarray(sig_var_names, dtype='str')

# Rank DEGs for labelling by |LFC| + 3*log(-log padj), strongest first
top_ind = np.argsort(np.abs(lfc_values)+3*log_log_padj_values)[::-1]
top_names = np.asarray(sig_var_names)[top_ind]

# Label the (up to) 100 top-ranked genes
for pos, gene_name in zip(top_ind[0:100], top_names[0:100]):
    plt.text(lfc_values[pos]+0.03, log_log_padj_values[pos]+0.05, str(gene_name), fontsize=10, zorder=15)

# Genes of interest that are labelled regardless of their rank
select_names = ['Fos','Jun','Junb']
select_ind = []

for gene_name in select_names:
    matches = np.flatnonzero(sig_names_arr == gene_name)
    if matches.size == 0:
        # The gene did not pass the DEG filter, so there is no point to label
        print('Not among the DEGs, label skipped:', gene_name)
        continue
    pos = matches[0]
    select_ind.append(pos)
    plt.text(lfc_values[pos]+0.03, log_log_padj_values[pos]+0.05, str(gene_name), fontsize=10, zorder=15)


plt.xlim([-8,9])
plt.ylim([0.75,5])

plt.ylabel('log(-log p)', fontsize=13)
plt.xlabel('LFC', fontsize=13)

plt.title('Diff. response to pathogenicity in spleen aggregated DCs: KO vs WT', fontsize='small')

plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/volcano_kopat_vs_rest_pathogenic_spleen_aggregatedDC_interaction.pdf', dpi=600,bbox_inches = "tight")

plt.show()

# Overall ranking of the DEGs by |LFC| + log_padj, strongest first
top_genes=np.asarray(sig_var_names)[np.argsort(np.abs(lfc_values)+sig_log_padj.to_numpy())[::-1]]

print('top_genes:',top_genes )

In [None]:
# Show the Cd74 row of the DEG table (an empty frame if Cd74 is not among the DEGs)
degs.loc[degs.index == 'Cd74', :]

In [None]:
# Display the table of interaction DEGs selected in the volcano-plot cell
degs

In [None]:
# Write the interaction DEG table to CSV, keeping the gene index as the first column.
# The commented-out lines are alternative output files (CD8minusDC1 and DC2, going by
# their file names); only the aggregatedDC file is written.
# degs.to_csv('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/Spleen_kopat_vs_rest_interaction_degs_CD8minusDC1.csv', index=True)
# degs.to_csv('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/Spleen_kopat_vs_rest_interaction_degs_DC2.csv', index=True)
degs.to_csv('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/Spleen_kopat_vs_rest_interaction_degs_aggregatedDC.csv', index=True)

In [None]:

# Load the tables exported from R: raw counts, normalized counts and the
# gene / sample annotation, each with its first column as the index.
raw_counts = pd.read_csv("/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/res_raw_counts.csv", index_col=0)
norm_counts = pd.read_csv("/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/res_normalized_counts.csv", index_col=0)
gene_info = pd.read_csv("/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/res_gene_info.csv", index_col=0)
sample_info = pd.read_csv("/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/res_sample_info.csv", index_col=0)

# Wrap the transposed raw counts in an AnnData object; observations and
# variables are named after the columns and the index of raw_counts respectively.
dds_adata = ad.AnnData(
    X=raw_counts.T.to_numpy(),
    obs=sample_info,
    var=gene_info,
)
dds_adata.obs_names = raw_counts.columns
dds_adata.var_names = raw_counts.index

# Normalized counts go into a layer, transposed the same way as the raw counts
dds_adata.layers['normalized'] = norm_counts.T.to_numpy()

# Size factors from the sample table are copied into obs
dds_adata.obs['size_factors'] = sample_info['sizeFactor']

# Record what the main matrix and the layer contain
dds_adata.uns['layer_description'] = dict(
    X='raw_counts',
    normalized='deseq2_normalized_counts',
)

# Sanity check of the resulting structure
print(dds_adata)
print("\nAvailable layers:", list(dds_adata.layers.keys()))
print("\nShape of raw counts:", dds_adata.X.shape)
print("Shape of normalized counts:", dds_adata.layers['normalized'].shape)


# Not run. We import dds from R with fully implemented interaction analysis
dds = DeseqDataSet(counts = counts.astype(int).T,
                           metadata = metadata,
                           design='~genotype + pathogenicity + genotype:pathogenicity')
        
sc.pp.filter_genes(dds, min_cells = 1)
        
dds.deseq2()

In [None]:
# Heatmap of the interaction DEGs: mean log1p-normalised expression per condition,
# z-scored per gene, with the direction of each response overlaid as '+'/'-' signs.
dds =dds_adata.copy()

# Stricter fold-change cut-off for plotting (|LFC| > 1; the log_padj cut-off stays at -log(0.05)).
# The same thresholds will be used in GOA
degs = de_res[(de_res['log_padj'] > -np.log(eps+0.05))&(np.abs(de_res['log2FoldChange'])>1.0)]

sig_logfoldchanges = degs['log2FoldChange']
sig_log_padj = degs['log_padj']
sig_var_names = list(degs.index)

# Rank genes by |LFC| + log_padj, strongest first
top_genes=np.asarray(sig_var_names)[np.argsort(np.abs(sig_logfoldchanges)+sig_log_padj)[::-1]]

dds.layers['log1p'] = np.log1p(dds.layers['normalized'])
dds_sig = dds[:, sig_var_names]

log1p_counts = pd.DataFrame(dds_sig.layers['log1p'], index=dds_sig.obs.index, columns=dds_sig.var_names)

# conditions = ['wt_naive','wt_pathogenic','ko_naive','ko_pathogenic']  
conditions = ['ko_pathogenic','ko_naive','wt_pathogenic','wt_naive']  


#Construct a dataframe that will hold the mean expression levels of the filtered genes in various conditions
mean_log1p_counts = pd.DataFrame(index=conditions, columns=dds_sig.var_names, dtype='float')

for condition in conditions:
    # Get all rows that start with the current condition
    rows = [row for row in log1p_counts.index if row.startswith(condition)]
    
    # Calculate the mean of these rows
    condition_mean = np.nanmean(log1p_counts.loc[rows],axis=0)
    
    mean_log1p_counts.loc[condition] =  condition_mean.astype(float)


# Response to pathogenicity within each genotype, and the KO-minus-WT difference of the responses
diff_ko = mean_log1p_counts.loc['ko_pathogenic'] - mean_log1p_counts.loc['ko_naive']
diff_wt = mean_log1p_counts.loc['wt_pathogenic'] - mean_log1p_counts.loc['wt_naive']
ddiff_ko_wt = diff_ko - diff_wt

plot_genes = top_genes

# '+' for a positive difference, '-' otherwise
signs_ko = {gene: '+' if d > 0 else '-' for gene, d in diff_ko[plot_genes].items()}
signs_wt = {gene: '+' if d > 0 else '-' for gene, d in diff_wt[plot_genes].items()}
signs_ko_wt = {gene: '+' if d > 0 else '-' for gene, d in ddiff_ko_wt[plot_genes].items()}


sns.set_style("white")

# No plt.figure(dpi=1200) here: sns.clustermap creates its own figure, so the
# explicit call only produced an unused, blank high-dpi figure.
g=sns.clustermap(mean_log1p_counts[plot_genes].T,
               z_score=0,
               # standard_scale=0,  
               cmap='RdYlBu_r',
               yticklabels=True,
               xticklabels=['KO Pathogenic', 'KO Naive' ,'WT Pathogenic','WT Naive'],
               col_cluster=False,
               row_cluster=True,
               cbar_pos=None,
               dendrogram_ratio=(.3, 0.),
               tree_kws=dict(linewidths=1.0, colors=(0.2, 0.2, 0.4)),
               figsize = (8,20)
               # cbar_pos=(0.85, 0.8, 0.1, 0.05),
              )

g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_ymajorticklabels(), fontsize = 9, rotation=180)
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xmajorticklabels(), fontsize = 9, rotation=180)

labels = g.ax_heatmap.yaxis.get_majorticklabels()

# plt.title('Top DEGs in tumor response difference: Spleen data')

# Get the reordered index of genes (row order after clustering, as positions into plot_genes)
reordered_genes = g.dendrogram_row.reordered_ind

ax = g.ax_heatmap

# One entry per overlaid sign column:
# (sign lookup, x of the text for '+', x of the text for '-', left edge / width / alpha
#  of the white backing patch, font weight, font size).
# With the column order KO Pathogenic | KO Naive | WT Pathogenic | WT Naive, x=1 is the
# boundary inside the KO pair (KO response), x=3 the boundary inside the WT pair
# (WT response) and x=2 the boundary between the pairs (KO-WT response difference).
sign_overlays = [
    (signs_ko,    1.01, 1.0, 0.925, 0.15, 0.8, 'medium', 10),
    (signs_wt,    3.01, 3.0, 2.925, 0.15, 0.8, 'medium', 10),
    (signs_ko_wt, 2.01, 2.0, 1.9,   0.2,  0.9, 'bold',   14),
]

# Add the signs on top of the heatmap
for idx, gene_idx in enumerate(reordered_genes):

    gene = plot_genes[gene_idx]

    for signs, x_plus, x_minus, rect_left, rect_width, rect_alpha, weight, size in sign_overlays:
        sign = signs[gene]

        # White backing patch so the sign stays readable on any cell colour
        rect = patches.Rectangle((rect_left, idx), rect_width, 1, fill=True, facecolor='white', edgecolor='none', alpha=rect_alpha)
        ax.add_patch(rect)

        # '+' is drawn at a slightly shifted x position compared to '-'
        x_text = x_plus if sign == '+' else x_minus
        ax.text(x_text, idx+0.5, sign, ha='center', va='center', fontweight=weight, fontsize=size, rotation=90)


plt.tight_layout()
plt.savefig('/Users/oipulk/Documents/scRNASeq/data/Eleftheria_Maranou_Mar2024/analysis/figures/pseudobulk/spleen/deseq2_interaction_results/Spleen_DC_interactionDEA_heatmap.pdf', dpi=600,bbox_inches = "tight")

fig = plt.gcf()

# Add a colorbar
# cax = fig.add_axes([0.8, 0.8, 0.1, 0.1])  # [left, bottom, width, height]
# fig.colorbar(g.ax_heatmap.collections[0], cax=cax)

plt.show()

In [None]:
# Text of the y-axis tick labels of the clustered heatmap `g` (these were set to the gene names)
labels = [t.get_text() for t in g.ax_heatmap.yaxis.get_majorticklabels()]

In [None]:
# Print one label per line. A plain loop is used because a list comprehension
# here would also echo a list of None values as the cell output.
for deg in labels:
    print(deg)