In [None]:
import numpy as np
import scanpy as sc
import seaborn as sns
import os
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import sys
# Directory that holds the project-local helper module imported below as `helperVDF`.
# NOTE(review): this is an absolute, user-specific Windows path; on any other
# machine it has to be adjusted by hand before the notebook can run.
path_helper = ["C:\\","Users","vfriedrich","projects","monkey_IZI","git_documentation","scRNAseq_cross_species_primate_human","analysis","helper"]
helper_dir = os.path.join(*path_helper)
# Register the directory only once so that re-running this cell does not keep
# appending the same entry to sys.path.
if helper_dir not in sys.path:
    sys.path.append(helper_dir)
import helperVDF as h
import decoupler
# Show the path of the Python interpreter that runs this kernel.
print(sys.executable)

In [None]:
# Run configuration.
# `pre` is the prefix put in front of every file name this notebook writes;
# `drive` is handed to the project path helper.
pre = "MH76"
drive = 'F'

# Ask the project helper for the four base locations used below
# (judging by the names: models, tables, plots and AnnData objects).
(
    base_model_path,
    base_table_path,
    base_plots_path,
    base_anndata_objects,
) = h.return_local_paths(drive=drive, pre=pre, add_path=True)

### KEGG

In [None]:
def _load_edger_table(prefix):
    """Load the edgeR DGE table written under `prefix` and add FDR flag columns.

    Parameters
    ----------
    prefix : str
        Prefix of the run that produced the table (used for the path lookup
        and as the leading part of the CSV file name).

    Returns
    -------
    tuple
        ``(table_dir, dge)``: the table directory returned by the path helper
        and the DataFrame with three extra boolean columns marking rows whose
        'FDR' is <= 0.25, <= 0.5 and <= 0.2.
    """
    _, table_dir, _, _ = h.return_local_paths(drive=drive, pre=prefix, add_path=False)
    dge = pd.read_csv(os.path.join(table_dir, prefix + '_dge_tab_edgeR.csv'), index_col=0)
    for flag, cutoff in (('FDR_0.25', 0.25), ('FDR_0.5', 0.5), ('FDR_0.2', 0.2)):
        dge[flag] = dge['FDR'] <= cutoff
    return table_dir, dge


# cyno
base_table_path_M71, df_dge_M = _load_edger_table("M71")

# human
base_table_path_H71, df_dge_H = _load_edger_table("H71")

# Label of the gene-set collection; it becomes part of the output file names.
db_collection = 'KEGG'

# Fetch the MSigDB resource, keep the KEGG pathway collection only and make
# every (gene set, gene symbol) pair unique.
msigdb = decoupler.get_resource("MSigDB")
msigdb = (
    msigdb.loc[msigdb['collection'] == 'kegg_pathways']
    .drop_duplicates(subset=['geneset', 'genesymbol'])
)

In [None]:
# Distinct cell types per species, in order of first appearance in each DGE table.
celltypes_M = list(df_dge_M['celltype'].unique())
celltypes_H = list(df_dge_H['celltype'].unique())

# Distinct comparison labels.
# NOTE(review): these come from the cyno table only and are reused for the
# human table further down; confirm both tables carry the same labels.
conditions = list(df_dge_M['comparison.vs.00hr'].unique())

In [None]:
# Display the comparison labels collected in the previous cell.
conditions

In [None]:
# Over-representation analysis (ORA) for cyno: one result table per
# cell type x comparison, written as CSV into `base_table_path`.
species = 'cyno'
for celltype in celltypes_M:
    is_celltype = df_dge_M['celltype'] == celltype
    for condition in conditions:
        is_condition = df_dge_M['comparison.vs.00hr'] == condition
        subset = df_dge_M.loc[is_celltype & is_condition]

        # NOTE(review): the cut-off is applied to the unadjusted 'PValue'
        # column, not to 'FDR'; confirm this is intended.
        hits = subset.loc[subset['PValue'] <= 0.05].set_index('genes')

        ora_df = decoupler.get_ora_df(df=hits, net=msigdb, source='geneset', target='genesymbol')

        out_name = f"{pre}_{species}_{celltype}_{condition}_{db_collection}_ora_df.csv"
        ora_df.to_csv(os.path.join(base_table_path, out_name))

In [None]:
# Over-representation analysis (ORA) for human: one result table per
# cell type x comparison, written as CSV into `base_table_path`.
species = 'human'
for celltype in celltypes_H:
    celltype_rows = df_dge_H['celltype'] == celltype
    for condition in conditions:
        condition_rows = df_dge_H['comparison.vs.00hr'] == condition
        selected = df_dge_H.loc[celltype_rows & condition_rows]

        # NOTE(review): genes are selected on the unadjusted 'PValue' column
        # rather than on 'FDR'; confirm this is intended.
        significant = selected.loc[selected['PValue'] <= 0.05].set_index('genes')

        ora_df = decoupler.get_ora_df(
            df=significant, net=msigdb, source='geneset', target='genesymbol'
        )

        csv_name = f"{pre}_{species}_{celltype}_{condition}_{db_collection}_ora_df.csv"
        ora_df.to_csv(os.path.join(base_table_path, csv_name))

### Human

In [None]:
# Re-load the human ORA tables and remember, per cell type and condition,
# the four terms with the highest odds ratio.
species = 'human'
conditions = ['timepoints06hr', 'timepoints24hr']
db_collection = 'KEGG'
human_KEGG_dict = {}
for celltype in celltypes_H:
    per_celltype = {}
    for condition in conditions:
        ora_file = f"{pre}_{species}_{celltype}_{condition}_{db_collection}_ora_df.csv"
        KEGG_df = pd.read_csv(os.path.join(base_table_path, ora_file), index_col=0)

        by_odds_ratio = KEGG_df.sort_values(by='Odds ratio', ascending=False)
        per_celltype[condition] = list(by_odds_ratio['Term'].head(4).values)
        # NOTE(review): this key is reassigned on every pass, so after the
        # loop it holds only the table of the last condition in `conditions`.
        per_celltype['KEGG_df'] = KEGG_df
    human_KEGG_dict[celltype] = per_celltype

### Cyno

In [None]:
# Re-load the cyno ORA tables and remember, per cell type and condition,
# the four terms with the highest odds ratio.
species = 'cyno'
conditions = ['timepoints06hr', 'timepoints24hr']
db_collection = 'KEGG'
cyno_KEGG_dict = {}
for celltype in celltypes_M:
    entries = {}
    for condition in conditions:
        table_name = f"{pre}_{species}_{celltype}_{condition}_{db_collection}_ora_df.csv"
        KEGG_df = pd.read_csv(os.path.join(base_table_path, table_name), index_col=0)

        ranked = KEGG_df.sort_values(by='Odds ratio', ascending=False)
        entries[condition] = list(ranked.head(4)['Term'].values)
        # NOTE(review): this key is reassigned on every pass, so after the
        # loop it holds only the table of the last condition in `conditions`.
        entries['KEGG_df'] = KEGG_df
    cyno_KEGG_dict[celltype] = entries

### Top pathways

In [None]:
# Union of the top-ranked KEGG terms of both species over every cell type
# that occurs in both species and every condition.
shared_celltypes = set(celltypes_M) & set(celltypes_H)
top_terms = set()
for celltype in shared_celltypes:
    for condition in conditions:
        top_terms.update(cyno_KEGG_dict[celltype][condition])
        top_terms.update(human_KEGG_dict[celltype][condition])

# Sorted so the list has the same order on every run; plain set iteration
# order for strings can differ between Python sessions.
pathways = sorted(top_terms)

In [None]:
# Kept from the original cell: it silences warnings globally, which also
# affects the cells that run after this one.
warnings.simplefilter("ignore")


def _read_ora_table(species_name, celltype, condition):
    """Read the ORA result CSV written earlier for one species / cell type / condition."""
    file_name = f"{pre}_{species_name}_{celltype}_{condition}_{db_collection}_ora_df.csv"
    return pd.read_csv(os.path.join(base_table_path, file_name), index_col=0)


# For every shared cell type and every condition, collect the rows of the
# selected pathways from both species into one table.
KEGG_top_pathway_dict = {}
for celltype in sorted(set(celltypes_M) & set(celltypes_H)):
    cond_dict = {}
    for condition in conditions:
        per_species = []
        # Human first, then cyno: same row order as the original concatenation.
        for species_name in ('human', 'cyno'):
            # Load the table that belongs to THIS condition. The per-cell-type
            # dictionaries built earlier keep a single 'KEGG_df' entry (the last
            # condition read), so using it here would label the same data with
            # every condition.
            ora_table = _read_ora_table(species_name, celltype, condition)

            # .copy() so the new columns are written to an independent frame
            # and not to a view of the filtered table.
            top_rows = ora_table[ora_table['Term'].isin(pathways)].copy()
            top_rows['species'] = species_name
            # condition[-4:] keeps the last four characters of the label (e.g. '06hr').
            top_rows['species+celltype+condition'] = species_name + '_' + celltype + '_' + condition[-4:]
            per_species.append(top_rows)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        cond_dict[condition] = pd.concat(per_species)
    KEGG_top_pathway_dict[celltype] = cond_dict

In [None]:
# Column order of the combined plotting table: the ORA result columns
# followed by the two label columns added in this notebook.
columns_df = [
    'Term',
    'Set size',
    'Overlap ratio',
    'p-value',
    'FDR p-value',
    'Odds ratio',
    'Combined score',
    'Features',
    'species',
    'species+celltype+condition',
]

In [None]:
# Stack the per-condition tables of the three plotted cell types into one
# frame. The pieces are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every step.
plot_frames = []
for celltype in ['CD4_T','CD8_T','B']:
    for condition in conditions:
        plot_frames.append(KEGG_top_pathway_dict[celltype][condition])

if plot_frames:
    df_KEGG_plot = pd.concat(plot_frames)
    # Same column layout as before: the columns listed in `columns_df` first,
    # then any additional columns the tables may carry.
    extra_columns = [col for col in df_KEGG_plot.columns if col not in columns_df]
    df_KEGG_plot = df_KEGG_plot.reindex(columns=columns_df + extra_columns)
else:
    # Nothing to stack (empty `conditions`): keep an empty, correctly shaped table.
    df_KEGG_plot = pd.DataFrame(columns=columns_df)

df_KEGG_plot.to_csv(os.path.join(base_table_path,pre + '_df_KEGG_plot.csv'))

In [None]:
# Dot plot of the selected KEGG terms: one column per species/cell type/condition,
# dot size = odds ratio, dot colour = FDR-adjusted p-value.
fig, ax = plt.subplots(figsize=(10, 12))
scatter = sns.scatterplot(
    data=df_KEGG_plot,
    x='species+celltype+condition',
    y='Term',
    hue='FDR p-value',
    size='Odds ratio',
    sizes=(100, 800),
    palette='Reds_r',
    edgecolor='black',
    alpha=1,
    legend=True,
    ax=ax,
)
plt.xticks(rotation=90)
# Move the legend to the right of the axes.
ax.legend(bbox_to_anchor=(1.2, 1))

pdf_path = os.path.join(base_plots_path, pre + '_scatter_DGE_KEGG.pdf')
fig.savefig(pdf_path, bbox_inches='tight')
plt.show()

### Save session

In [None]:
# Record the software environment of this run through the project helper module.
# NOTE(review): where and in which format the versions are stored is decided
# inside helperVDF, which is not visible here; `pre` is passed along with the path.
base_package_version_path = h.return_package_version_local_path(drive=drive)
h.save_package_versions(base_package_version_path,pre,do_print = True)
# Presumably prints the versions of the main packages; confirm against helperVDF.
h.print_main_versions()