In [None]:
import numpy as np
import scanpy as sc
import seaborn as sns
import os
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import sys
import pickle as pkl
# Make the project-local helper module `helperVDF` importable by appending its folder to sys.path.
# NOTE(review): this is a hard-coded, user-specific absolute Windows path; it will not resolve
# on another machine or account -- consider making it configurable.
path_helper = ["C:\\","Users","vfriedrich","projects","monkey_IZI","git_documentation","scRNAseq_cross_species_primate_human","analysis","helper"]
sys.path.append(os.path.join(*path_helper))
import helperVDF as h
#import decoupler
print(sys.executable)  # show which Python interpreter this kernel is running
import gseapy
from gseapy.plot import barplot, dotplot
# Suppresses ALL warnings for the remainder of the notebook, including pandas/seaborn
# deprecation and copy warnings raised by the cells below.
warnings.filterwarnings("ignore")
import re
from matplotlib.colors import Normalize

In [None]:
# Run configuration: `pre` is the prefix put in front of every file this notebook writes,
# `drive` is handed to the helper that resolves the local output folders.
pre = "MH128"
drive = 'F'

# The helper returns four paths in this order: models, tables, plots, AnnData objects.
(
    base_model_path,
    base_table_path,
    base_plots_path,
    base_anndata_objects,
) = h.return_local_paths(drive=drive, pre=pre, add_path=True)
def remove_go(term):
    """Return `term` without any "(GO:<digits>)" annotation.

    Every occurrence of the pattern, together with the whitespace directly in
    front of it, is deleted; the result is then stripped of leading and
    trailing whitespace.
    """
    return re.sub(r'\s*\(GO:\d+\)', '', term).strip()

def intersect_over_union(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Parameters
    ----------
    set1 : set or frozenset
        First collection; must provide ``intersection`` and ``union``.
    set2 : iterable
        Second collection.

    Returns
    -------
    float
        Value in [0, 1]. If both inputs are empty the index is undefined (0/0)
        and ``nan`` is returned instead of raising ``ZeroDivisionError``, so one
        empty pair cannot abort a row-wise computation over a whole table.
    """
    union = set1.union(set2)
    if not union:
        # 0 / 0: there is no meaningful overlap to report.
        return float('nan')
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

# Hex colours reserved for the two species.
col_human = "#e35e28"
col_cyno = "#31c7ba"

# Only the table folder (second return value) of the MH115 run is needed here.
_, base_table_path_MH115, _, _ = h.return_local_paths(
    drive=drive,
    pre='MH115',
    add_path=True,
)

In [None]:
# Pathway table written by the MH115 run; the first CSV column is used as the index.
df_all_pathways = pd.read_csv(
    os.path.join(base_table_path_MH115, 'MH115_all_df_pathways_.csv'),
    index_col=0,
)

In [None]:
# Values of the 'comparison.vs.00hr' column to analyse.
conditions  = ['timepoints06hr','timepoints24hr']
# Cell types available per species.
celltypes_cyno = ['CD4_T', 'CD14_Mono', 'NKProliferating', 'CD8_T', 'B', 'MAIT', 'dnT']
celltypes_human = ['CD4_T', 'CD14_Mono', 'NKProliferating', 'CD8_T', 'B', 'MAIT']
# Cell types present in both species. list(set & set) has an arbitrary order that can
# change between kernel sessions (string hashing is randomised), which made the row order
# of everything derived from this list non-reproducible. Keep the order of the human list.
celltypes_both = [celltype for celltype in celltypes_human if celltype in set(celltypes_cyno)]

In [None]:
# For every shared cell type and every time point: find the pathway terms present for BOTH
# species and score how similar the two 'Genes' lists are (intersection over union).
# One result frame per (cell type, time point) is collected in `foreground_dfs`.
foreground_dfs = []
for celltype in celltypes_both:
    for condition in conditions:
        # Rows of this (cell type, time point) stratum, split by species.
        in_stratum = (df_all_pathways['celltype'] == celltype) & (df_all_pathways['comparison.vs.00hr'] == condition)
        df_filt_cyno = df_all_pathways[in_stratum & (df_all_pathways['species'] == 'cyno')]
        df_filt_human = df_all_pathways[in_stratum & (df_all_pathways['species'] == 'human')]

        list_coenriched_cyno_human = list(set(df_filt_human['Term']) & set(df_filt_cyno['Term']))
        if not list_coenriched_cyno_human:
            continue  # no pathway shared between the species in this stratum

        # rename() returns a new frame; the previous inplace=True rename mutated a filtered
        # slice, which is the pattern pandas warns about (and warnings are silenced globally).
        df_filt_cyno = df_filt_cyno[df_filt_cyno['Term'].isin(list_coenriched_cyno_human)].rename(
            columns={'Genes': 'Genes_cyno', 'Foreground Count': 'Foreground Count_cyno'})
        df_filt_human = df_filt_human[df_filt_human['Term'].isin(list_coenriched_cyno_human)].rename(
            columns={'Genes': 'Genes_human', 'Foreground Count': 'Foreground Count_human'})

        # One row per shared term with both species' gene strings side by side.
        df_foreground_ct = pd.merge(df_filt_cyno[['Term','Genes_cyno','Foreground Count_cyno']],
                                    df_filt_human[['Term','Genes_human','Foreground Count_human']],
                                    on='Term')
        # 'Genes' is a ';'-separated string; turn it into a list of gene symbols.
        df_foreground_ct['Genes_cyno'] = df_foreground_ct['Genes_cyno'].str.split(';')
        df_foreground_ct['Genes_human'] = df_foreground_ct['Genes_human'].str.split(';')

        df_foreground_ct['IoU'] = [
            intersect_over_union(set(genes_cyno), set(genes_human))
            for genes_cyno, genes_human in zip(df_foreground_ct['Genes_cyno'], df_foreground_ct['Genes_human'])
        ]

        df_foreground_ct['celltype'] = celltype
        # Last four characters of the condition label, e.g. 'timepoints06hr' -> '06hr'.
        df_foreground_ct['condition'] = condition[-4:]
        foreground_dfs.append(df_foreground_ct)

In [None]:
# Stack the per-(cell type, time point) frames into one table.
# ignore_index=True gives a fresh 0..N-1 index: every sub-frame carries its own 0..k index,
# so a plain concat produced duplicate index labels in the frame that is plotted and saved.
foreground_dfs_all = pd.concat(foreground_dfs, ignore_index=True)

In [None]:
# Write the combined IoU table into this run's table folder.
foreground_dfs_all.to_csv(
    os.path.join(base_table_path, pre + '_foreground_dfs_all.csv')
)

In [None]:
# Display order of the hue levels (time points) and of the x-axis categories (cell types).
condition_order = [
    '06hr',
    '24hr',
]
celltype_order = [
    'NKProliferating',
    'B',
    'CD4_T',
    'MAIT',
    'CD14_Mono',
    'CD8_T',
]

In [None]:
# Box plot of the per-pathway IoU values for each cell type, split by time point,
# with the individual pathways drawn on top as a dodged strip plot.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x='celltype', y='IoU', hue='condition', hue_order=condition_order,
    data=foreground_dfs_all, boxprops=dict(alpha=.7), palette="tab10",
    order=celltype_order, ax=ax,
)
sns.stripplot(
    x='celltype', y='IoU', hue='condition', data=foreground_dfs_all,
    hue_order=condition_order, jitter=True, size=3.5, alpha=0.9, dodge=True,
    order=celltype_order, ax=ax,
)
ax.set_title('Jaccard Index for foreground gene sets in co-enriched pathways')
ax.set_ylabel('Jaccard Index')
ax.grid(False)
# Both layers register legend entries; keep only the first two handles so each
# time point is listed once.
handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(handles[:2], labels[:2], title='Condition', loc='upper left', bbox_to_anchor=(1, 1))
fig.savefig(os.path.join(base_plots_path, pre + '_boxplot_Jaccard_Index_coenriched_pathways.pdf'), bbox_inches='tight')
plt.show()

## Comparison with VAE

In [None]:
# Only the table folder (second return value) of the MH123 run is needed here.
_, base_table_path_MH123, _, _ = h.return_local_paths(
    drive=drive,
    pre='MH123',
    add_path=True,
)

In [None]:
# Load the VAE-vs-DGE pathway comparison table written by the MH123 run.
df_pathway_comp_VAE_DGE = pd.read_csv(os.path.join(base_table_path_MH123,'MH123_df_pathway_comp_VAE_DGE.csv'),index_col=0)

# read_csv returns the gene columns as plain strings, so the later set(row['Genes_...'])
# calls would build sets of single CHARACTERS instead of gene symbols and the resulting
# IoU would be meaningless. Convert every non-missing cell into a list of symbols once, here.
# The tokeniser splits on the delimiters of both plausible serialisations
# ("A;B;C" and "['A', 'B']") -- NOTE(review): confirm which format MH123 wrote.
# Missing cells stay NaN so the downstream .notna() filters keep working.
for _gene_col in [col for col in df_pathway_comp_VAE_DGE.columns if str(col).startswith('Genes_')]:
    df_pathway_comp_VAE_DGE[_gene_col] = df_pathway_comp_VAE_DGE[_gene_col].map(
        lambda cell: re.findall(r"[^\s;,\[\]'\"]+", cell) if isinstance(cell, str) else cell
    )

#### cyno 06hr & VAE

In [None]:
# Rows that have a cyno 06hr gene entry, reduced to the columns needed for the comparison.
# A single .loc selection plus .copy() replaces the chained df[mask][[cols]] indexing, so
# adding the 'IoU' column below writes to an independent frame.
df_VAE_06 = df_pathway_comp_VAE_DGE.loc[
    df_pathway_comp_VAE_DGE['Genes_cyno_06hr'].notna(),
    ['Term','Genes_VAE','Genes_cyno_06hr'],
].copy()

# NOTE(review): set(...) needs a list-like of gene symbols per cell; on a raw CSV string
# it would yield a set of characters -- verify the gene columns are parsed upstream.
df_VAE_06['IoU'] = df_VAE_06.apply(lambda row: intersect_over_union(set(row['Genes_VAE']), set(row['Genes_cyno_06hr'])), axis=1)

# Median IoU across these pathways (shown as the cell output).
np.median(df_VAE_06['IoU'])

In [None]:
# Show the cyno 06hr vs VAE comparison table, including the 'IoU' column, as the cell output.
df_VAE_06

#### cyno 24hr & VAE

In [None]:
# Rows that have a cyno 24hr gene entry, reduced to the columns needed for the comparison.
# A single .loc selection plus .copy() replaces the chained df[mask][[cols]] indexing, so
# adding the 'IoU' column below writes to an independent frame.
df_VAE_24 = df_pathway_comp_VAE_DGE.loc[
    df_pathway_comp_VAE_DGE['Genes_cyno_24hr'].notna(),
    ['Term','Genes_VAE','Genes_cyno_24hr'],
].copy()

# NOTE(review): set(...) needs a list-like of gene symbols per cell; on a raw CSV string
# it would yield a set of characters -- verify the gene columns are parsed upstream.
df_VAE_24['IoU'] = df_VAE_24.apply(lambda row: intersect_over_union(set(row['Genes_VAE']), set(row['Genes_cyno_24hr'])), axis=1)

# Median IoU across these pathways (shown as the cell output).
np.median(df_VAE_24['IoU'])

In [None]:
# Show the cyno 24hr vs VAE comparison table, including the 'IoU' column, as the cell output.
df_VAE_24

#### human 06hr & VAE

In [None]:
# Rows that have a human 06hr gene entry, reduced to the columns needed for the comparison.
# A single .loc selection plus .copy() replaces the chained df[mask][[cols]] indexing, so
# adding the 'IoU' column below writes to an independent frame.
df_VAE_06_human = df_pathway_comp_VAE_DGE.loc[
    df_pathway_comp_VAE_DGE['Genes_human_06hr'].notna(),
    ['Term','Genes_VAE','Genes_human_06hr'],
].copy()

# NOTE(review): set(...) needs a list-like of gene symbols per cell; on a raw CSV string
# it would yield a set of characters -- verify the gene columns are parsed upstream.
df_VAE_06_human['IoU'] = df_VAE_06_human.apply(lambda row: intersect_over_union(set(row['Genes_VAE']), set(row['Genes_human_06hr'])), axis=1)

# Median IoU across these pathways (shown as the cell output).
np.median(df_VAE_06_human['IoU'])

In [None]:
# Show the human 06hr vs VAE comparison table, including the 'IoU' column, as the cell output.
df_VAE_06_human

#### human 24hr & VAE

In [None]:
# Rows that have a human 24hr gene entry, reduced to the columns needed for the comparison.
# A single .loc selection plus .copy() replaces the chained df[mask][[cols]] indexing, so
# adding the 'IoU' column below writes to an independent frame.
df_VAE_24_human = df_pathway_comp_VAE_DGE.loc[
    df_pathway_comp_VAE_DGE['Genes_human_24hr'].notna(),
    ['Term','Genes_VAE','Genes_human_24hr'],
].copy()

# NOTE(review): set(...) needs a list-like of gene symbols per cell; on a raw CSV string
# it would yield a set of characters -- verify the gene columns are parsed upstream.
df_VAE_24_human['IoU'] = df_VAE_24_human.apply(lambda row: intersect_over_union(set(row['Genes_VAE']), set(row['Genes_human_24hr'])), axis=1)

# Median IoU across these pathways (shown as the cell output).
np.median(df_VAE_24_human['IoU'])

In [None]:
# Show the human 24hr vs VAE comparison table, including the 'IoU' column, as the cell output.
df_VAE_24_human

Save session: record the package versions used for this run.

In [None]:
# Resolve the folder for package-version records, write this run's versions there
# (tagged with the run prefix), then print the main versions.
base_package_version_path = h.return_package_version_local_path(drive=drive)
h.save_package_versions(
    base_package_version_path,
    pre,
    do_print=True,
)
h.print_main_versions()