In [None]:
import numpy as np
import scanpy as sc
import seaborn as sns
import os
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import sys
import pickle as pkl
path_helper = ["C:\\","Users","vfriedrich","projects","monkey_IZI","git_documentation","scRNAseq_cross_species_primate_human","analysis","helper"]
sys.path.append(os.path.join(*path_helper))
import helperVDF as h
#import decoupler
print(sys.executable)
import gseapy
from gseapy.plot import barplot, dotplot
warnings.filterwarnings("ignore")
import re

In [None]:
# Run configuration: `pre` is prepended to every table/plot file name written by this
# notebook; `drive` is handed to the local helper module to resolve the output folders.
pre = "MH113"
drive = 'F'
# NOTE(review): return_local_paths lives in helperVDF (not shown here); judging by the
# names, the four returned values are the model, table, plot and AnnData directories.
base_model_path,base_table_path,base_plots_path,base_anndata_objects = h.return_local_paths(drive = drive,
                                                                                            pre = pre,
                                                                                            add_path = True)

### Human

In [None]:
def make_df_annotation_azimut1_5_scanvi_v2(df):
    """Add a coarser annotation column in which the two NK labels are pooled.

    The column ``cluster_azimut1_5_scanvi`` is copied to
    ``cluster_azimut1_5_scanvi_v2`` with the labels ``'NK'`` and
    ``'NK Proliferating'`` both rewritten to ``'NK+Proliferating'``; every
    other label is carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``cluster_azimut1_5_scanvi`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new column.
    """
    pooled_label = 'NK+Proliferating'
    relabel = dict.fromkeys(('NK', 'NK Proliferating'), pooled_label)
    source_labels = df['cluster_azimut1_5_scanvi']
    df['cluster_azimut1_5_scanvi_v2'] = source_labels.replace(relabel)
    return df

def make_overview_from_rank_gene_group(adata,cluster_of_interest,species):
    """Collect the ``rank_genes_groups`` result of one group into a tidy table.

    Reads ``adata.uns['rank_genes_groups']`` and extracts, for
    ``cluster_of_interest``, the entries ``pvals_adj``, ``pvals``,
    ``logfoldchanges``, ``scores`` and ``names``.

    The table is assembled column by column so that the numeric statistics
    keep their numeric dtype (stacking the arrays as rows and transposing
    turns every column into ``object`` dtype).

    Parameters
    ----------
    adata : object with a ``uns`` mapping
        Must contain a ``'rank_genes_groups'`` entry whose fields can be
        indexed by ``cluster_of_interest``.
    cluster_of_interest : str
        Name of the group whose statistics are extracted; also stored in the
        ``scvi_cluster`` column.
    species : str
        Label stored in the ``species`` column.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns ``pvals_adj``, ``pvals``,
        ``logfoldchanges``, ``scores``, ``gene_names``, ``scvi_cluster``,
        ``species``, ``pvals_adj_0.05`` and ``pvals_adj_0.2``, sorted by
        ascending ``pvals_adj``.
    """
    dge_results = adata.uns['rank_genes_groups']
    df_gene_ranks = pd.DataFrame({
        'pvals_adj': dge_results['pvals_adj'][cluster_of_interest],
        'pvals': dge_results['pvals'][cluster_of_interest],
        'logfoldchanges': dge_results['logfoldchanges'][cluster_of_interest],
        'scores': dge_results['scores'][cluster_of_interest],
        'gene_names': dge_results['names'][cluster_of_interest],
    })
    df_gene_ranks['scvi_cluster'] = cluster_of_interest
    df_gene_ranks['species'] = species
    # Boolean convenience flags for the two significance thresholds.
    df_gene_ranks['pvals_adj_0.05'] = df_gene_ranks['pvals_adj'] <= 0.05
    df_gene_ranks['pvals_adj_0.2'] = df_gene_ranks['pvals_adj'] <= 0.2
    df_gene_ranks = df_gene_ranks.sort_values(by='pvals_adj',ascending = True)
    return df_gene_ranks

def remove_go(term):
    """Return ``term`` without Gene Ontology accession tags.

    Every occurrence of the pattern ``(GO:<digits>)``, together with any
    whitespace directly in front of it, is removed; the result is stripped of
    leading and trailing whitespace.
    """
    go_accession = re.compile(r'\s*\(GO:\d+\)')
    without_accession = go_accession.sub('', term)
    return without_accession.strip()

def make_enrichment_enrichr_human_multidatabase(
    adata_human,celltype,celltype_col,cluster_of_interest,species = 'human',nr_pathways_to_show = 15
):
    """Rank the genes of one scVI cluster within a cell type and run Enrichr on them.

    Steps:

    1. Subset ``adata_human`` with ``h.filter_adata_obs`` (presumably keeps the
       cells whose ``celltype_col`` equals ``celltype`` -- helper not shown here).
    2. Run a Wilcoxon ``rank_genes_groups`` of every ``scvi_clusters`` group
       against the rest and tabulate the result for ``cluster_of_interest``.
    3. Keep genes with adjusted p-value <= 0.05, order them by absolute log
       fold change and send the top 200 to Enrichr.
    4. Keep pathways with adjusted p-value < 0.05 and at least two overlapping
       genes, order them by odds ratio, drop pathways with an identical gene
       set, and draw the first ``nr_pathways_to_show`` as a horizontal bar plot.

    Side effects: writes four CSV tables to ``base_table_path`` and one PDF to
    ``base_plots_path`` (both module-level names), all prefixed with ``pre``.

    Parameters
    ----------
    adata_human : AnnData
        Expression data with ``scvi_clusters`` and ``celltype_col`` in ``.obs``.
    celltype : str
        Cell-type label used for subsetting.
    celltype_col : str
        ``.obs`` column that holds the cell-type labels.
    cluster_of_interest : str
        ``scvi_clusters`` group for which genes are ranked.
    species : str, default 'human'
        Label used in file names, in the plot title and in the ``species``
        column of the gene table.
    nr_pathways_to_show : int, default 15
        Maximum number of pathways drawn; fewer are drawn when fewer pass the
        filters.

    Returns
    -------
    None
        Returns early (after printing a message) when no gene or no pathway
        passes the filters.
    """
    # Shared "<celltype>_<cluster>_<species>" tail of every table file name.
    run_tag = str(celltype) + '_' + str(cluster_of_interest) + '_' + str(species)

    adata_human_ct = h.filter_adata_obs(adata_human,col_name=celltype_col,val=celltype)
    adata_human_ct.obs['scvi_clusters'] = adata_human_ct.obs['scvi_clusters'].astype('category')

    sc.tl.rank_genes_groups(
        adata_human_ct,
        groupby='scvi_clusters',
        reference="rest",
        method="wilcoxon"
    )
    df_gene_ranks = make_overview_from_rank_gene_group(adata=adata_human_ct,
                                                       cluster_of_interest = cluster_of_interest,
                                                       species = species)
    df_gene_ranks.to_csv(os.path.join(base_table_path, pre + '_df_gene_ranks_' + run_tag + '.csv'))

    # Work on an explicit copy so the new column is not written onto a slice.
    df_gene_ranks_filtered = df_gene_ranks[df_gene_ranks['pvals_adj'] <= 0.05].copy()
    df_gene_ranks_filtered['abs_logfoldchanges'] = np.abs(df_gene_ranks_filtered['logfoldchanges'])
    df_gene_ranks_filtered = df_gene_ranks_filtered.sort_values(by='abs_logfoldchanges',ascending = False)
    df_gene_ranks_filtered.to_csv(os.path.join(base_table_path, pre + '_df_gene_ranks_filtered_' + run_tag + '.csv'))
    top_200_genes = df_gene_ranks_filtered.head(200)['gene_names'].tolist()

    if not top_200_genes:
        # print instead of warnings.warn: this notebook silences all warnings.
        print('No gene with adjusted p-value <= 0.05 for ' + run_tag + '; enrichment skipped.')
        return

    enrichment = gseapy.enrichr(gene_list=top_200_genes,
     gene_sets=['Reactome_Pathways_2024','WikiPathway_2023_Human','GO_Biological_Process_2023','KEGG_2021_Human','KEGG_2016'],
     organism='Human',
     cutoff=0.05
     )

    df = enrichment.results
    df.to_csv(os.path.join(base_table_path, pre + '_df_pathways_' + run_tag + '.csv'))
    filtered_df = df[df['Adjusted P-value'] < 0.05].copy()
    # 'Overlap' has the form "<hits>/<set size>"; the hits are the foreground count.
    filtered_df['Foreground Count'] = filtered_df['Overlap'].apply(lambda x: int(x.split('/')[0]))
    filtered_df = filtered_df[filtered_df['Foreground Count'] >= 2]
    filtered_df = filtered_df.sort_values(by='Odds Ratio', ascending=False)
    filtered_df['Log Odds Ratio'] = np.log(filtered_df['Odds Ratio'])
    # Different databases can report the same gene set; keep the highest odds ratio.
    filtered_df = filtered_df.drop_duplicates(subset=['Genes'], keep='first')
    filtered_df['short_term'] = filtered_df['Term'].apply(remove_go)
    filtered_df.to_csv(os.path.join(base_table_path, pre + '_filtered_df_pathways_' + run_tag + '.csv'))

    top_pathways = filtered_df.head(nr_pathways_to_show)
    if top_pathways.empty:
        print('No pathway passed the filters for ' + run_tag + '; no plot written.')
        return

    # Numeric bar positions: the number of ticks always equals the number of bars
    # (also when fewer than nr_pathways_to_show pathways remain), and two pathways
    # with the same shortened name still get separate bars.
    bar_positions = np.arange(len(top_pathways))
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_pathways['Log Odds Ratio'].to_numpy(), color="#574571")
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_pathways['short_term'].tolist(), fontweight='bold')
    ax.set_xlabel('Log Odds Ratio')
    ax.set_title(str(species) + ' - ' + str(celltype) + ' - cluster ' +str(cluster_of_interest))
    ax.invert_yaxis()  # highest odds ratio at the top
    ax.grid(True, linestyle='--', alpha=0.2)
    fig.tight_layout()
    fig.savefig(
        os.path.join(base_plots_path, pre +'_' +str(species) + '_' + str(celltype) + '_' + str(cluster_of_interest) + '.pdf'),
        dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Load the cluster annotation table (gzip-compressed; first column becomes the index).
# It is merged into the AnnData .obs tables on the index further below.
# NOTE(review): absolute path with a hard-coded drive letter, independent of `drive`.
path_clusteranno_s0140 = 'F:\\monkey_IZI\\analysis\\tables\\s0140_clusteranno_scvi.txt.gz'
clusteranno_s0140 = pd.read_csv(path_clusteranno_s0140,compression = 'gzip',index_col=0)

In [None]:
# Load the annotated human AnnData object (prefix 'H24') and attach the cluster annotation.
species = 'human'
_,_,_,base_anndata_objects_H24 = h.return_local_paths(drive,pre='H24',add_path = False)
adata_human = sc.read_h5ad(os.path.join(base_anndata_objects_H24,'H24' + '_' + species + '_anno_celltypes_v0.h5ad'))
# Index-on-index merge with pandas' default inner join.
# NOTE(review): cells missing from clusteranno_s0140 would be dropped from the merged table -- confirm all cells are covered.
adata_human.obs = pd.merge(adata_human.obs, clusteranno_s0140, left_index=True, right_index=True)
adata_human.obs = make_df_annotation_azimut1_5_scanvi_v2(adata_human.obs)
# Columns present in both tables get an '_x' suffix on the left side; restore the plain names.
adata_human.obs.rename(columns={"timepoint_x": "timepoint", "individual_x": "individual","species_x" : "species"}, inplace=True)

In [None]:
# .obs column with the cell-type labels (NK labels pooled) used to subset cells.
celltype_col ='cluster_azimut1_5_scanvi_v2'

In [None]:
# Library-size normalisation followed by log1p, applied to adata_human itself
# (the return values are not assigned).
# NOTE(review): not idempotent -- re-running this cell without reloading adata_human
# would transform the data a second time.
sc.pp.normalize_total(adata_human)
sc.pp.log1p(adata_human)

In [None]:

# NOTE(review): both values are assigned again in the next parameter cell before use;
# this cell looks like a leftover.
cluster_of_interest = '10'
species = 'human'


### Human CD8 T
- cluster 10

In [None]:
# Selection for the human CD8 T analysis.
# NOTE: `species` is not passed to the call below; the function's default ('human') is used.
species = 'human'
celltype = 'CD8 T'
cluster_of_interest = '10'
celltype_col ='cluster_azimut1_5_scanvi_v2'

In [None]:
# Gene ranking + Enrichr for the selection above; writes CSV tables and a PDF bar plot.
make_enrichment_enrichr_human_multidatabase(adata_human=adata_human,
                                            celltype=celltype,
                                            celltype_col=celltype_col,
                                            cluster_of_interest=cluster_of_interest)

### Human NK+Proliferating
- cluster 13

In [None]:
# Selection for the human NK+Proliferating analysis.
# NOTE: `species` is not passed to the call below; the function's default ('human') is used.
celltype = 'NK+Proliferating'
cluster_of_interest = '13'
species = 'human'
celltype_col ='cluster_azimut1_5_scanvi_v2'

In [None]:
# Gene ranking + Enrichr for the selection above; writes CSV tables and a PDF bar plot.
make_enrichment_enrichr_human_multidatabase(adata_human=adata_human,
                                            celltype=celltype,
                                            celltype_col=celltype_col,
                                            cluster_of_interest=cluster_of_interest)

### Cynomolgus monkey

In [None]:
# Load the annotated cynomolgus AnnData object (prefix 'M24') and attach the cluster annotation.
species = 'cyno'
_,_,_,base_anndata_objects_M24 = h.return_local_paths(drive,pre='M24',add_path = False)
adata_cyno = sc.read_h5ad(os.path.join(base_anndata_objects_M24,'M24' + '_' + species + '_anno_celltypes_v0.h5ad'))
# Index-on-index merge with pandas' default inner join.
# NOTE(review): cells missing from clusteranno_s0140 would be dropped from the merged table -- confirm all cells are covered.
adata_cyno.obs = pd.merge(adata_cyno.obs, clusteranno_s0140, left_index=True, right_index=True)
adata_cyno.obs = make_df_annotation_azimut1_5_scanvi_v2(adata_cyno.obs)
# Columns present in both tables get an '_x' suffix on the left side; restore the plain names.
adata_cyno.obs.rename(columns={"timepoint_x": "timepoint", "individual_x": "individual","species_x" : "species"}, inplace=True)

In [None]:
# Library-size normalisation followed by log1p, applied to adata_cyno itself
# (the return values are not assigned).
# NOTE(review): not idempotent -- re-running this cell without reloading adata_cyno
# would transform the data a second time.
sc.pp.normalize_total(adata_cyno)
sc.pp.log1p(adata_cyno)

In [None]:
def _humanize_cyno_gene_ranks(df_gene_ranks, df_ortho):
    """Translate cyno gene names in a gene-rank table to human gene names.

    For every row of ``df_gene_ranks`` the orthologue table ``df_ortho`` is
    searched for entries whose ``seurat_gene`` equals the row's ``gene_names``
    and whose ``Human gene name`` is not missing:

    * entries flagged ``ortho_1to1_cyno2human == True`` rename the row in place;
    * entries flagged ``False`` leave the row untouched and add a copy of the
      row that carries the human gene name instead.

    Rows without a usable orthologue keep their cyno name. Rows are addressed
    by position, so a human name that happens to equal another cyno name can
    never redirect a rename to the wrong row. A (cyno, human) pair that is
    listed several times in ``df_ortho`` is used once (first occurrence).

    Returns a new DataFrame (renamed rows followed by the added copies);
    ``df_gene_ranks`` itself is not modified.
    """
    ortho = df_ortho[df_ortho['Human gene name'].notna()]
    ortho = ortho.drop_duplicates(subset=['seurat_gene', 'Human gene name'], keep='first')

    # One pass over the orthologue table instead of one table scan per gene.
    orthologs_by_cyno_gene = {}
    for cyno_gene, human_gene, is_1to1 in zip(ortho['seurat_gene'],
                                              ortho['Human gene name'],
                                              ortho['ortho_1to1_cyno2human']):
        orthologs_by_cyno_gene.setdefault(cyno_gene, []).append((human_gene, is_1to1))

    cyno_names = df_gene_ranks['gene_names'].tolist()
    humanized_names = list(cyno_names)
    duplicate_positions = []
    duplicate_names = []
    for position, cyno_gene in enumerate(cyno_names):
        for human_gene, is_1to1 in orthologs_by_cyno_gene.get(cyno_gene, ()):
            # Explicit comparisons: a missing flag (NaN) matches neither branch.
            if is_1to1 == True:
                humanized_names[position] = human_gene
            elif is_1to1 == False:
                duplicate_positions.append(position)
                duplicate_names.append(human_gene)

    df_renamed = df_gene_ranks.copy()
    df_renamed['gene_names'] = humanized_names
    if not duplicate_positions:
        return df_renamed

    df_duplicates = df_gene_ranks.iloc[duplicate_positions].copy()
    df_duplicates['gene_names'] = duplicate_names
    # DataFrame.append no longer exists in pandas >= 2.0; concat is the replacement.
    return pd.concat([df_renamed, df_duplicates])


def make_enrichment_enrichr_cyno_multidatabase(
    adata_cyno,celltype,celltype_col,cluster_of_interest,species = 'cyno',nr_pathways_to_show = 15,
    ortho_table_path = None
):
    """Rank the genes of one scVI cluster within a cell type, map them to human
    gene names and run Enrichr on them.

    Steps:

    1. Subset ``adata_cyno`` with ``h.filter_adata_obs`` (presumably keeps the
       cells whose ``celltype_col`` equals ``celltype`` -- helper not shown here).
    2. Run a Wilcoxon ``rank_genes_groups`` of every ``scvi_clusters`` group
       against the rest and tabulate the result for ``cluster_of_interest``.
    3. Translate cyno gene names to human gene names with the orthologue table
       (1-to-1 orthologues rename the row, other orthologues add extra rows).
    4. Keep genes with adjusted p-value <= 0.05, order them by absolute log
       fold change and send the top 200 to Enrichr (human libraries).
    5. Keep pathways with adjusted p-value < 0.05 and at least two overlapping
       genes, order them by odds ratio, drop pathways with an identical gene
       set, and draw the first ``nr_pathways_to_show`` as a horizontal bar plot.

    Side effects: writes five CSV tables to ``base_table_path`` and one PDF to
    ``base_plots_path`` (both module-level names), all prefixed with ``pre``.

    Parameters
    ----------
    adata_cyno : AnnData
        Expression data with ``scvi_clusters`` and ``celltype_col`` in ``.obs``.
    celltype : str
        Cell-type label used for subsetting.
    celltype_col : str
        ``.obs`` column that holds the cell-type labels.
    cluster_of_interest : str
        ``scvi_clusters`` group for which genes are ranked.
    species : str, default 'cyno'
        Label used in file names, in the plot title and in the ``species``
        column of the gene table.
    nr_pathways_to_show : int, default 15
        Maximum number of pathways drawn; fewer are drawn when fewer pass the
        filters.
    ortho_table_path : str or None, default None
        Gzip-compressed CSV with the columns ``seurat_gene``,
        ``Human gene name`` and ``ortho_1to1_cyno2human``. ``None`` selects
        the project's default table location.

    Returns
    -------
    None
        Returns early (after printing a message) when no gene or no pathway
        passes the filters.
    """
    # Shared "<celltype>_<cluster>_<species>" tail of every table file name.
    run_tag = str(celltype) + '_' + str(cluster_of_interest) + '_' + str(species)

    adata_cyno_ct = h.filter_adata_obs(adata_cyno,col_name=celltype_col,val=celltype)
    adata_cyno_ct.obs['scvi_clusters'] = adata_cyno_ct.obs['scvi_clusters'].astype('category')

    sc.tl.rank_genes_groups(
        adata_cyno_ct,
        groupby='scvi_clusters',
        reference="rest",
        method="wilcoxon"
    )

    df_gene_ranks = make_overview_from_rank_gene_group(adata=adata_cyno_ct,
                                                       cluster_of_interest = cluster_of_interest,
                                                       species = species)
    df_gene_ranks.to_csv(os.path.join(base_table_path, pre + '_df_gene_ranks_' + run_tag + '.csv'))

    if ortho_table_path is None:
        ortho_table_path = os.path.join(*["C:\\","Users","vfriedrich","projects","monkey_IZI","git_documentation","scRNAseq_cross_species_primate_human","analysisR",
         "results_GIT","s0120_cyno2human_n_to_m_orthologues.csv.gz"])
    df_ortho_tab_s0120_nm = pd.read_csv(ortho_table_path, compression='gzip')

    df_gene_ranks_humanized = _humanize_cyno_gene_ranks(df_gene_ranks, df_ortho_tab_s0120_nm)
    df_gene_ranks_humanized.to_csv(os.path.join(base_table_path, pre + '_df_gene_ranks_humanized_' + run_tag + '.csv'))

    # Explicit copy, and the fold change is taken from the row itself rather than
    # through index alignment with another frame (the index labels repeat here).
    df_gene_ranks_filtered = df_gene_ranks_humanized[df_gene_ranks_humanized['pvals_adj'] <= 0.05].copy()
    df_gene_ranks_filtered['abs_logfoldchanges'] = np.abs(df_gene_ranks_filtered['logfoldchanges'])
    df_gene_ranks_filtered = df_gene_ranks_filtered.sort_values(by='abs_logfoldchanges',ascending = False)
    df_gene_ranks_filtered.to_csv(os.path.join(base_table_path, pre + '_df_gene_ranks_filtered_' + run_tag + '.csv'))
    top_200_genes = df_gene_ranks_filtered.head(200)['gene_names'].tolist()

    if not top_200_genes:
        # print instead of warnings.warn: this notebook silences all warnings.
        print('No gene with adjusted p-value <= 0.05 for ' + run_tag + '; enrichment skipped.')
        return

    enrichment = gseapy.enrichr(gene_list=top_200_genes,
     gene_sets=['Reactome_Pathways_2024','WikiPathway_2023_Human','GO_Biological_Process_2023','KEGG_2021_Human','KEGG_2016'],
     organism='Human',
     cutoff=0.05
     )

    df = enrichment.results
    df.to_csv(os.path.join(base_table_path, pre + '_df_pathways_' + run_tag + '.csv'))
    filtered_df = df[df['Adjusted P-value'] < 0.05].copy()
    # 'Overlap' has the form "<hits>/<set size>"; the hits are the foreground count.
    filtered_df['Foreground Count'] = filtered_df['Overlap'].apply(lambda x: int(x.split('/')[0]))
    filtered_df = filtered_df[filtered_df['Foreground Count'] >= 2]
    filtered_df = filtered_df.sort_values(by='Odds Ratio', ascending=False)
    filtered_df['Log Odds Ratio'] = np.log(filtered_df['Odds Ratio'])
    # Different databases can report the same gene set; keep the highest odds ratio.
    filtered_df = filtered_df.drop_duplicates(subset=['Genes'], keep='first')
    filtered_df['short_term'] = filtered_df['Term'].apply(remove_go)
    filtered_df.to_csv(os.path.join(base_table_path, pre + '_filtered_df_pathways_' + run_tag + '.csv'))

    top_pathways = filtered_df.head(nr_pathways_to_show)
    if top_pathways.empty:
        print('No pathway passed the filters for ' + run_tag + '; no plot written.')
        return

    # Numeric bar positions: the number of ticks always equals the number of bars
    # (also when fewer than nr_pathways_to_show pathways remain), and two pathways
    # with the same shortened name still get separate bars.
    bar_positions = np.arange(len(top_pathways))
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_pathways['Log Odds Ratio'].to_numpy(), color="#574571")
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_pathways['short_term'].tolist(), fontweight='bold')
    ax.set_xlabel('Log Odds Ratio')
    ax.set_title(str(species) + ' - ' + str(celltype) + ' - cluster ' +str(cluster_of_interest))
    ax.invert_yaxis()  # highest odds ratio at the top
    ax.grid(True, linestyle='--', alpha=0.2)
    fig.tight_layout()
    fig.savefig(
        os.path.join(base_plots_path, pre +'_' +str(species) + '_' + str(celltype) + '_' + str(cluster_of_interest) + '.pdf'),
        dpi=300, bbox_inches='tight')
    plt.show()

### Cyno CD8 T
- cluster 10

In [None]:
# Selection for the cyno CD8 T analysis.
# NOTE: `species` is not passed to the call below; the function's default ('cyno') is used.
species = 'cyno'
celltype = 'CD8 T'
cluster_of_interest = '10'
celltype_col ='cluster_azimut1_5_scanvi_v2'

In [None]:
# Gene ranking, orthologue mapping and Enrichr for the selection above; writes CSV tables and a PDF bar plot.
make_enrichment_enrichr_cyno_multidatabase(adata_cyno=adata_cyno,
                                            celltype=celltype,
                                            celltype_col=celltype_col,
                                            cluster_of_interest=cluster_of_interest)

### Cyno NK+Proliferating
- cluster 13

In [None]:
# Selection for the cyno NK+Proliferating analysis.
# NOTE: `species` is not passed to the call below; the function's default ('cyno') is used.
celltype = 'NK+Proliferating'
cluster_of_interest = '13'
species = 'cyno'
celltype_col ='cluster_azimut1_5_scanvi_v2'

In [None]:
# Gene ranking, orthologue mapping and Enrichr for the selection above; writes CSV tables and a PDF bar plot.
make_enrichment_enrichr_cyno_multidatabase(adata_cyno=adata_cyno,
                                            celltype=celltype,
                                            celltype_col=celltype_col,
                                            cluster_of_interest=cluster_of_interest)

### Cyno CD16 Mono
- cluster 21

In [None]:
# Selection for the cyno CD16 Mono analysis.
# NOTE: `species` is not passed to the call below; the function's default ('cyno') is used.
celltype = 'CD16 Mono'
cluster_of_interest = '21'
species = 'cyno'
celltype_col ='cluster_azimut1_5_scanvi_v2'

In [None]:
# Gene ranking, orthologue mapping and Enrichr for the selection above; writes CSV tables and a PDF bar plot.
make_enrichment_enrichr_cyno_multidatabase(adata_cyno=adata_cyno,
                                            celltype=celltype,
                                            celltype_col=celltype_col,
                                            cluster_of_interest=cluster_of_interest)

### Save session

In [None]:
# Record the package versions of this session under the run prefix, using the
# local helper module (helperVDF, not shown here), and print the main versions.
base_package_version_path = h.return_package_version_local_path(drive=drive)
h.save_package_versions(base_package_version_path,pre,do_print = True)
h.print_main_versions()