# Debugging autoreload

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edited
# modules are re-imported automatically before each cell runs (see the
# IPython autoreload documentation for the exact mode semantics).
%load_ext autoreload
%autoreload 2

# Load packages

In [4]:
import numpy as np
import pandas as pd
import warnings
import pathlib
import os
import matplotlib.pyplot as plt
import seaborn as sns
import distinctipy
import matplotlib.colors as mcolors
import matplotlib.patheffects as pe
from plottable import ColumnDefinition, Table
from scipy.stats import mannwhitneyu
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
from plottable import ColumnDefinition, Table
from plottable.plots import bar
from plottable.cmap import normed_cmap
import copy
import plotly.graph_objects as go
from statsmodels.stats.multitest import multipletests
from sklearn.model_selection import StratifiedKFold
from scipy import stats
import gseapy as gp
from biothings_client import get_client
from collections import Counter
from tqdm import tqdm


def make_rgb_transparent(rgb, bg_rgb, alpha):
    """Blend a foreground colour onto a background colour.

    Each channel of ``rgb`` is paired with the matching channel of ``bg_rgb``
    and mixed as ``alpha * fg + (1 - alpha) * bg``.

    Args:
        rgb: Iterable of foreground channel values.
        bg_rgb: Iterable of background channel values.
        alpha: Weight of the foreground colour.

    Returns:
        A list with one blended value per paired channel; pairing stops at
        the shorter of the two inputs.
    """
    blended = []
    for fg_channel, bg_channel in zip(rgb, bg_rgb):
        blended.append(alpha * fg_channel + (1 - alpha) * bg_channel)
    return blended


# Setup path

In [2]:
# Root data directory; the cells below read from and write to "<path>/gseapy".
path = 'E:/YandexDisk/Work/pydnameth/draft/13_fmba_cvd_dnam/data/120_1'
# Create the gseapy output folder (and any missing parents); an already
# existing folder is not an error.
(pathlib.Path(path) / "gseapy").mkdir(parents=True, exist_ok=True)

# Load Human GSEA libs

In [None]:
# Fetch the gene-set library names that gseapy reports for "Human".
gsea_libs = gp.get_library_name("Human")
# Keep the names as the index of an otherwise empty frame so a later cell can
# reload them from the workbook with index_col=0.
df_gsea_libs = pd.DataFrame(index=gsea_libs)
df_gsea_libs.to_excel(
    f"{path}/gseapy/libraries.xlsx",
    index_label='libraries',
    index=True,
)

# Entrez gene name

In [5]:
# Target genes are the index of the limma result table.
genes_trgt = pd.read_csv(f"{path}/GSEA(ebayes)_group_genes_orgn_limma.csv", index_col=0).index.tolist()

mygene = get_client("gene")
df_genes = pd.DataFrame(index=genes_trgt)
# Default: every gene maps to its own name; the loop below overwrites the
# entry only when MyGene returns one unambiguous, different symbol.
df_genes['Entrez'] = df_genes.index.values

for gene in (pbar := tqdm(df_genes.index.values)):
    pbar.set_description(f"MyGene {gene}")
    df_query = mygene.query(gene, scopes='entrezgene', species='human', as_dataframe=True)
    # No hits, or hits that carry no "symbol" field: keep the original name
    # instead of failing with a KeyError on the column lookup.
    if df_query.empty or "symbol" not in df_query.columns:
        continue
    # Build the set of returned symbols once; it is needed for both checks.
    symbols = set(df_query["symbol"].values)
    # Replace only when the queried name is not among the returned symbols
    # and all hits agree on a single symbol.
    if gene not in symbols and len(symbols) == 1:
        # .iloc[0] is explicitly positional. Plain [0] on a Series whose index
        # is not integer-based relies on a deprecated positional fallback.
        df_genes.at[gene, 'Entrez'] = df_query["symbol"].iloc[0]

df_genes.to_excel(f"{path}/gseapy/df_genes.xlsx", index_label='Gene')

MyGene HOXA3: 100%|██████████| 304/304 [01:28<00:00,  3.42it/s]        


# Enrichr

In [6]:
# Reload the inputs from the workbooks written by earlier cells; in both
# files the values of interest are stored in the index column.
# NOTE(review): df_genes.xlsx also holds the looked-up names in its 'Entrez'
# column, but only the index (the original names) is read here - confirm
# that this is intended.
genes_trgt = pd.read_excel(f"{path}/gseapy/df_genes.xlsx", index_col=0).index.tolist()
gsea_libs = pd.read_excel(f"{path}/gseapy/libraries.xlsx", index_col=0).index.tolist()

# Keyword arguments passed unchanged to every Enrichr call.
enrichr_options = dict(
    organism='Human',
    outdir=None,
    cutoff=1.00,
    verbose=False,
    no_plot=True,
)

# Run one Enrichr query per library and collect the per-library result tables.
dfs_enrichr = []
pbar = tqdm(gsea_libs)
for gsea_lib in pbar:
    pbar.set_description(f"Processing {gsea_lib}")
    enr = gp.enrichr(gene_list=list(genes_trgt), gene_sets=gsea_lib, **enrichr_options)
    dfs_enrichr.append(enr.results)

# Stack all libraries into one table with a fresh 0..n-1 index and save it.
df_enrichr = pd.concat(dfs_enrichr, ignore_index=True)
df_enrichr.to_excel(f"{path}/gseapy/enrichr_full.xlsx", index=True)

Processing miRTarBase_2017: 100%|██████████| 223/223 [24:37<00:00,  6.62s/it]                                  


In [15]:
# Same Enrichr run as for the full library list, restricted to a single
# hand-picked library and saved to its own workbook.
genes_trgt = pd.read_excel(f"{path}/gseapy/df_genes.xlsx", index_col=0).index.tolist()
gsea_libs = ['GO_Biological_Process_2025']

# Keyword arguments passed unchanged to every Enrichr call.
enrichr_options = dict(
    organism='Human',
    outdir=None,
    cutoff=1.00,
    verbose=False,
    no_plot=True,
)

# Collect one result table per requested library.
dfs_enrichr = []
pbar = tqdm(gsea_libs)
for gsea_lib in pbar:
    pbar.set_description(f"Processing {gsea_lib}")
    enr = gp.enrichr(gene_list=list(genes_trgt), gene_sets=gsea_lib, **enrichr_options)
    dfs_enrichr.append(enr.results)

# Concatenate with a fresh 0..n-1 index and write the target-library workbook.
df_enrichr = pd.concat(dfs_enrichr, ignore_index=True)
df_enrichr.to_excel(f"{path}/gseapy/enrichr_target.xlsx", index=True)

Processing GO_Biological_Process_2025: 100%|██████████| 1/1 [00:04<00:00,  4.05s/it]


In [None]:
# Start from the Enrichr table stored in enrichr_full.xlsx.
df_enrichr = pd.read_excel(f"{path}/gseapy/enrichr_full.xlsx", index_col=0)
# Libraries to keep, mapped to their display names; the dict order defines
# the order of the libraries in the sorted table.
gsea_libs_trgt = {
    'GO_Biological_Process_2025': 'GO Biological Process',
    'GO_Molecular_Function_2025': 'GO Molecular Function',
    'GO_Cellular_Component_2025': 'GO Cellular Component',
    'KEGG_2021_Human': 'KEGG'
}
# One colour per display name.
gsea_libs_trgt_colors = {
    'GO Biological Process': 'crimson',
    'GO Molecular Function': 'dodgerblue',
    'GO Cellular Component': 'limegreen',
    'KEGG': 'gold'
}
# Keep only terms from the target libraries with adjusted p-value < 0.05.
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the loaded table (SettingWithCopy).
is_significant = (
    df_enrichr['Gene_set'].isin(list(gsea_libs_trgt))
    & (df_enrichr['Adjusted P-value'] < 0.05)
)
df_enrichr = df_enrichr.loc[is_significant].copy()
# Rank of each library = its position in gsea_libs_trgt. Cast to float to
# keep the dtype the column had when it was filled row-group by row-group.
lib_rank = {lib: rank for rank, lib in enumerate(gsea_libs_trgt)}
df_enrichr['Order'] = df_enrichr['Gene_set'].map(lib_rank).astype(float)
df_enrichr = df_enrichr.sort_values(['Order', 'Adjusted P-value'], ascending=[True, True])
# Assign the replaced column back explicitly: an in-place replace on the
# selected column is a chained operation that does not update the frame when
# pandas Copy-on-Write is active.
df_enrichr['Gene_set'] = df_enrichr['Gene_set'].replace(gsea_libs_trgt)
df_enrichr = df_enrichr.rename(columns={'Gene_set': 'Gene Set'})
# NOTE: the column label says "p-value", the values are adjusted p-values.
df_enrichr[r"$-\log_{10}(\mathrm{p-value})$"] = -np.log10(df_enrichr['Adjusted P-value'].values)
# 'Overlap' has the form "<in>/<max>"; split it into two (string) columns and
# derive the percentage from them.
df_enrichr[['Genes In', 'Genes Max']] = df_enrichr['Overlap'].str.split('/', expand=True)
df_enrichr['% in Gene Set'] = df_enrichr['Genes In'].values.astype(float) / df_enrichr['Genes Max'].values.astype(float) * 100