In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina' 
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import gseapy as gp
from matplotlib import pyplot as plt
from tqdm import tqdm
import pickle
from gseapy import barplot, dotplot
import numbers

In [None]:
inputExcelPath = "sigX_data.xlsx"

# Columns kept from each sheet of the workbook (identical for both sheets).
_sheetColumns = ["Locus tag", "Gene name", "Fold change", "T statistic", "P-value"]

# One frame per sheet: the "Increased in PAOSX" and "Decreased in PAOSX" gene tables.
rawUpDF = pd.read_excel(inputExcelPath, sheet_name = "Increased in PAOSX")[_sheetColumns]
rawDownDF = pd.read_excel(inputExcelPath, sheet_name = "Decreased in PAOSX")[_sheetColumns]

# Locus tags from the conversion table; used later as the enrichment background.
conversionDF = pd.read_csv("PAO1_Conversion_df.csv")
backgroundPAO1 = conversionDF["Locus Tag"].to_list()

In [None]:
# Read the gene-set file into a list with one raw line per entry.
# A trailing newline in the file leaves an empty string as the last entry.
with open("pae_kegg_pathways.gmt", "r") as f:
    paths = f.read().split(sep = "\n")

def filterPathwayLength(inputPaths, minLength, outPath = "tmp.gmt"):
    """Write the gene sets that contain at least `minLength` genes to a file.

    Each entry of `inputPaths` is one tab-separated line. The first two fields
    (set name and description in the GMT layout) are skipped, and every
    remaining non-blank field is counted as one gene.

    Parameters
    ----------
    inputPaths : iterable of str
        Raw gene-set lines, one set per entry.
    minLength : int
        Minimum number of genes a set must have to be kept; must be > 0.
    outPath : str, optional
        File the kept lines are written to. Defaults to "tmp.gmt", the file
        this function has always written.

    Returns
    -------
    None
        The result is the file written at `outPath`; kept lines are written
        unchanged, each followed by a newline.

    Raises
    ------
    AssertionError
        If `minLength` is not an integer or is not greater than 0.
    """
    assert isinstance(minLength, int), "minLength must be an integer"
    assert minLength > 0, "minLength must be greater than 0"

    outPaths = []

    for path in inputPaths:
        # Blank fields (e.g. produced by trailing tabs) are not genes, so they
        # must not help a short set reach the minimum size.
        genes = [gene for gene in path.split("\t")[2:] if gene.strip()]
        if len(genes) >= minLength:
            outPaths.append(path)

    with open(outPath, "w") as w:
        w.writelines(path + "\n" for path in outPaths)

# Keep only gene sets with at least 10 genes; the filtered sets are written to "tmp.gmt".
filterPathwayLength(inputPaths = paths, minLength = 10)

In [None]:
# Enrichment of the "Locus tag" values from rawUpDF against the filtered gene
# sets in "tmp.gmt", using the PAO1 locus tags as background.
enr = gp.enrich(
    gene_list = rawUpDF["Locus tag"].to_list(),
    gene_sets = "tmp.gmt",
    background = backgroundPAO1,
    outdir = None
    )
# Author's notes on the result columns (Enrichr-style output):
# p = p-value computed using the Fisher exact test (Hypergeometric test)
# z = z-score (Odds Ratio)
# combined score = - log(p)·z
# NOTE(review): confirm these definitions against the documentation of the
# installed gseapy version.

enrOut = enr.results

# NOTE(security): pickle.load can execute arbitrary code while loading; only
# use a 'pae_descriptions.pkl' that comes from a trusted source.
with open('pae_descriptions.pkl', 'rb') as f:
    pathway2desc = pickle.load(f)

# Look up a description for every term; terms without an entry get NaN here.
enrOut['PathName'] = enrOut['Term'].map(pathway2desc)
# "Combined Score" is replaced in place by its log10; values that are not
# strictly positive (including NaN) become 0.
enrOut["Combined Score"] = enrOut["Combined Score"].apply(lambda x: np.log10(x) if x > 0 else 0)
# Use the description as the term label, but keep the original term wherever
# no description was found so that no label turns into NaN.
enrOut["Term"] = enrOut["PathName"].fillna(enrOut["Term"])

In [None]:
def filterSortTruncateEnrOut(enrichOutput = None, filterBy = 'Adjusted P-value', filterMax = 0.05, sortBy = 'Adjusted P-value', numOut = 20):
    """Filter an enrichment table, sort it, and keep at most `numOut` rows.

    Parameters
    ----------
    enrichOutput : pandas.DataFrame, optional
        Table to process. When omitted, the notebook's current `enrOut` is
        looked up at call time, so re-running the enrichment cell is picked up
        without redefining this function.
    filterBy : str
        Column compared against `filterMax`; rows with a value <= `filterMax`
        are kept. One of "Adjusted P-value", "Combined Score", "P-value",
        "Odds Ratio".
    filterMax : number
        Upper bound (inclusive) for the `filterBy` column.
    sortBy : str
        Column used for the ascending sort; same allowed values as `filterBy`.
    numOut : int
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted ascending by `sortBy`. When more than `numOut` rows
        pass the filter, the rows with the smallest values are kept for the
        p-value columns, and the rows at the high end of the sort are kept for
        "Combined Score" and "Odds Ratio". Rows whose sort key is NaN sort
        last and therefore count as the high end.

    Raises
    ------
    AssertionError
        If an argument has the wrong type or names an unsupported column.
    """
    if enrichOutput is None:
        # A default of `enrOut` in the signature would be frozen when the def
        # runs; resolving it here always uses the current global.
        enrichOutput = enrOut

    allowedColumns = ["Adjusted P-value", "Combined Score", "P-value", "Odds Ratio"]
    assert isinstance(enrichOutput, pd.DataFrame), "enrichOutput must be a DataFrame"
    assert filterBy in allowedColumns, "invalid filterBy input"
    assert sortBy in allowedColumns, "invalid sortBy input"
    assert isinstance(filterMax, numbers.Number), "filterMax must be numerical"
    assert isinstance(numOut, int), "numOut must be an integer"

    ranked = enrichOutput.loc[enrichOutput[filterBy] <= filterMax].sort_values(by = sortBy)

    if sortBy in ("Combined Score", "Odds Ratio"):
        # Larger is stronger for these columns, so truncating from the top of
        # an ascending sort would throw away the best terms; keep the tail.
        sigOut = ranked.tail(numOut)
    else:
        sigOut = ranked.head(numOut)
    # Return an independent frame so later edits never touch the input table.
    return sigOut.copy()

# Pass the current enrOut explicitly so the result never depends on which
# enrOut existed when the function was defined.
sigOut = filterSortTruncateEnrOut(enrichOutput = enrOut, sortBy = "Combined Score")

In [None]:
# Dot plot of the filtered terms; y_order is given the terms in sigOut's row order.
# x is left as None — presumably gseapy then picks its default x column; the
# axis label below assumes that is the log10-transformed "Combined Score"
# (TODO confirm).
ax = dotplot(
    sigOut,
    x = None,
    y_order = sigOut["Term"].tolist(),
    hue = "Adjusted P-value",
    top_term = 20,
    marker = 'o',
    show_ring = True,
    figsize = (5,15),
    xticklabels_rot = 45
    )
ax.set_xlabel("log10(Combined Score)")

In [None]:
# Bar plot of the same filtered table, limited to 10 terms via top_term.
ax = barplot(
    sigOut,
    column = "Adjusted P-value",
    hue = "Overlap",
    top_term = 10,
    figsize = (10,10)
    )