In [1]:
from gseapy.parser import Biomart
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt
import os
import glob

In [2]:
# use biomart
# read the list of deseq genes
# use prerank
# plot gsea

In [3]:
# conditions of interest:
#     LPS0_shCTRL_vs_shMOF
#     LPS0_shCTRL_vs_shPRDX1
#     LPS3_shCTRL_vs_shMOF
#     LPS3_shCTRL_vs_shPRDX1
#     LPS12_shCTRL_vs_shMOF
#     LPS12_shCTRL_vs_shPRDX1

In [4]:
in_path = os.path.join("/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison")
out_path = os.path.join("/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea")

In [5]:
# From GSEApy developer:
# prerank is used for comparing two group of samples (e.g. control and treatment),
# where the gene ranking are defined by your custom rank method (like t-statistic, signal-to-noise, et.al).

In [6]:
# convert IDs using biomart
from gseapy.parser import Biomart
bm = Biomart()
marts = bm.get_marts()
datasets = bm.get_datasets(mart='ENSEMBL_MART_ENSEMBL')

In [7]:
genes_names = pd.read_csv(os.path.join(out_path, "gene_id_ensembl.tsv"), sep = "\t")
queries = genes_names["gene_id"].values.tolist() # need to be a python list
print(len(queries))
results = bm.query(dataset='mmusculus_gene_ensembl',
                   attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene_id'],
                   filters={'ensembl_gene_id': queries})

47937


In [8]:
for time in ["lps0", "lps3","lps12"]:
    for file in glob.glob(os.path.join(in_path, "ddr_"+time+"*_shctrl.tsv")):
        name = os.path.basename(file)
        name = name.split(".tsv")[0].split("ddr_")[-1]
        print(name)
        df = pd.read_csv(file, sep = "\t", usecols = ["GeneID", "log2FoldChange"])
        df.sort_values(by=["log2FoldChange"], inplace = True, ascending = False)
        df['ensembl_gene_id'] = df['GeneID'].str.split('.', 1).str[0]
        merged_df = df.merge(results, on = "ensembl_gene_id", how = "inner")
        print(merged_df.head())
        df1 = pd.DataFrame()
        df1[0] = merged_df["external_gene_name"].str.upper()
        df1[1] = merged_df["log2FoldChange"]
        df1.dropna(inplace = True)
        df1.sort_values(by=[1], ascending = False, inplace = True)
        pre_res = gp.prerank(rnk=df1, gene_sets='KEGG_2019_Mouse',
                             processes=4,
                             seed=149,
                             graph_num = 25,
                             permutation_num=100,
                             outdir=os.path.join(out_path,'test_prerank_report_kegg_2019_'+name))

2022-05-16 18:24:50,697 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps0_shprdx1_lps0_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0  ENSMUSG00000052516.17        4.167885  ENSMUSG00000052516   
1   ENSMUSG00000026535.9        4.162830  ENSMUSG00000026535   
2  ENSMUSG00000029275.17        3.991026  ENSMUSG00000029275   
3   ENSMUSG00000092021.8        3.577217  ENSMUSG00000092021   
4  ENSMUSG00000015053.14        3.480622  ENSMUSG00000015053   

  external_gene_name  entrezgene_id  
0              Robo2         268902  
1            Ifi202b          26388  
2               Gfi1          14581  
3              Gbp11           <NA>  
4              Gata2          14461  


2022-05-16 18:25:41,190 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps0_shmof_lps0_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0  ENSMUSG00000052516.17        4.542760  ENSMUSG00000052516   
1   ENSMUSG00000023886.9        3.128205  ENSMUSG00000023886   
2  ENSMUSG00000041782.14        2.962411  ENSMUSG00000041782   
3   ENSMUSG00000038201.9        2.589884  ENSMUSG00000038201   
4  ENSMUSG00000015053.14        2.578129  ENSMUSG00000015053   

  external_gene_name  entrezgene_id  
0              Robo2         268902  
1              Smoc2          64074  
2               Lad1          16763  
3              Kcna7          16495  
4              Gata2          14461  


2022-05-16 18:26:31,763 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps3_shprdx1_lps3_shctrl
                 GeneID  log2FoldChange     ensembl_gene_id  \
0  ENSMUSG00000092837.1        4.957045  ENSMUSG00000092837   
1  ENSMUSG00000088088.1        4.923361  ENSMUSG00000088088   
2  ENSMUSG00000026535.9        3.782115  ENSMUSG00000026535   
3  ENSMUSG00000097848.1        3.200252  ENSMUSG00000097848   
4  ENSMUSG00000079138.3        3.097919  ENSMUSG00000079138   

  external_gene_name  entrezgene_id  
0              Rpph1          85029  
1               Rmrp          19782  
2            Ifi202b          26388  
3              Gm807         328320  
4             Gm8818           <NA>  


2022-05-16 18:27:22,278 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps3_shmof_lps3_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0   ENSMUSG00000082560.1        3.174656  ENSMUSG00000082560   
1  ENSMUSG00000070868.11        2.834957  ENSMUSG00000070868   
2   ENSMUSG00000052415.5        2.519693  ENSMUSG00000052415   
3   ENSMUSG00000022123.8        2.507119  ENSMUSG00000022123   
4   ENSMUSG00000070720.3        2.502026  ENSMUSG00000070720   

  external_gene_name  entrezgene_id  
0            Gm15157           <NA>  
1             Skint3         195564  
2               Tchh          99681  
3               Scel          64929  
4           Tmem200b         623230  


2022-05-16 18:28:21,134 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps3_shctrl_lps0_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0  ENSMUSG00000004296.14       11.608734  ENSMUSG00000004296   
1   ENSMUSG00000026582.6       11.218868  ENSMUSG00000026582   
2   ENSMUSG00000018916.5       11.131849  ENSMUSG00000018916   
3  ENSMUSG00000041782.14       11.025293  ENSMUSG00000041782   
4  ENSMUSG00000025746.11       10.988077  ENSMUSG00000025746   

  external_gene_name  entrezgene_id  
0              Il12b           <NA>  
1               Sele          20339  
2               Csf2          12981  
3               Lad1          16763  
4                Il6          16193  


2022-05-16 18:29:16,356 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps12_shctrl_lps3_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0   ENSMUSG00000029084.5        4.771642  ENSMUSG00000029084   
1   ENSMUSG00000095620.7        4.561110  ENSMUSG00000095620   
2   ENSMUSG00000019987.8        4.496170  ENSMUSG00000019987   
3   ENSMUSG00000051748.2        4.416183  ENSMUSG00000051748   
4  ENSMUSG00000064065.15        4.243571  ENSMUSG00000064065   

  external_gene_name  entrezgene_id  
0               Cd38           <NA>  
1              Csta2          76770  
2               Arg1          11846  
3             Wfdc21          66107  
4             Ipcef1         320495  


2022-05-16 18:30:08,717 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps12_shmof_lps12_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0  ENSMUSG00000059336.14        5.357557  ENSMUSG00000059336   
1  ENSMUSG00000048031.15        4.128252  ENSMUSG00000048031   
2  ENSMUSG00000035275.14        3.633860  ENSMUSG00000035275   
3   ENSMUSG00000039883.5        3.442256  ENSMUSG00000039883   
4  ENSMUSG00000025815.13        3.374905  ENSMUSG00000025815   

  external_gene_name  entrezgene_id  
0            Slc14a1         108052  
1              Fcrl5         329693  
2             Raver2         242570  
3             Lrrc17          74511  
4             Dhtkd1         209692  


2022-05-16 18:31:00,785 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps12_shctrl_lps0_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0  ENSMUSG00000026822.14       12.905310  ENSMUSG00000026822   
1   ENSMUSG00000040026.7       11.539613  ENSMUSG00000040026   
2   ENSMUSG00000029084.5       10.931293  ENSMUSG00000029084   
3  ENSMUSG00000024743.14       10.791555  ENSMUSG00000024743   
4  ENSMUSG00000041782.14       10.579538  ENSMUSG00000041782   

  external_gene_name  entrezgene_id  
0               Lcn2          16819  
1               Saa3          20210  
2               Cd38           <NA>  
3               Syt7          54525  
4               Lad1          16763  


2022-05-16 18:31:52,143 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


lps12_shprdx1_lps12_shctrl
                  GeneID  log2FoldChange     ensembl_gene_id  \
0   ENSMUSG00000091556.8        5.051291  ENSMUSG00000091556   
1   ENSMUSG00000097558.1        5.030853  ENSMUSG00000097558   
2   ENSMUSG00000082560.1        4.027485  ENSMUSG00000082560   
3  ENSMUSG00000041592.16        3.909540  ENSMUSG00000041592   
4  ENSMUSG00000034683.12        3.798470  ENSMUSG00000034683   

  external_gene_name  entrezgene_id  
0            Gm14569      101055983  
1            Gm26902           <NA>  
2            Gm15157           <NA>  
3               Sdk2         237979  
4            Ppp1r1c          75276  


In [9]:
# look for common pathways within a time point

In [10]:
for time in ["lps0", "lps3", "lps12"]:
    merged_df = pd.DataFrame()
    for index, file in enumerate(glob.glob(os.path.join(out_path, "test_prerank_*"+time+"*", "*csv"))):
        print(file)
        df = pd.read_csv(file, sep =",", usecols = ["Term", "nes", "pval", "fdr"])
        print(len(df))
        df = df.loc[df["fdr"] <= 0.25] # recommneded by developer!
        print(len(df))
        if index == 0:
            merged_df = df
        else:
            merged_df = merged_df.merge(df, how = "inner", on = "Term")
    print(merged_df)

/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea/test_prerank_report_kegg_2019_lps3_shctrl_lps0_shctrl/gseapy.prerank.gene_sets.report.csv
270
127
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea/test_prerank_report_kegg_2019_lps0_shprdx1_lps0_shctrl/gseapy.prerank.gene_sets.report.csv
270
15
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea/test_prerank_report_kegg_2019_lps12_shctrl_lps0_shctrl/gseapy.prerank.gene_sets.report.csv
270
107
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea/test_prerank_report_kegg_2019_lps0_shmof_lps0_shctrl/gseapy.prerank.gene_sets.report.csv
270
22
Empty DataFrame
Columns: [Term, nes_x, pval_x, fdr_x, nes_y, pval_y, fdr_y, nes_x, pval_x, fdr_x, nes_y, pval_y, fdr_y]
Index: []
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea/test_prerank_report_kegg_2019_lps3_shctrl_lps0_shctrl/gseapy.prerank.gene_sets.report.csv
270


  return merge(
  return merge(
  return merge(


In [11]:
# filter those with FC>0.5 (FC<0.05)+padi<0.05
# gp.enrichr

In [12]:
up_dfs = dict()
down_dfs = dict()
for time in ["lps0", "lps3", "lps12"]:
    for cond in ["shmof", "shprdx1"]:
        for file in glob.glob(os.path.join(in_path, "ddr_"+time+"*"+cond+"*"+time+"_shctrl.tsv")):
            print(file)
            df = pd.read_csv(file, sep = "\t", usecols = ["GeneID", "log2FoldChange", "padj"])
            df = df.loc[df["padj"] < 0.05]
            df['ensembl_gene_id'] = df['GeneID'].str.split('.', 1).str[0]
            merged_df = df.merge(results, on = "ensembl_gene_id", how = "inner")
            df_up = merged_df.loc[merged_df["log2FoldChange"] > 0.05]
            df_down = merged_df.loc[merged_df["log2FoldChange"] < 0.05]
            if time not in up_dfs.keys():
                up_dfs[time] = dict()
                down_dfs[time] = dict()
            
            up_dfs[time][cond] = df_up
            down_dfs[time][cond] = df_down
print(down_dfs["lps0"]["shprdx1"])

/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/ddr_lps0_shmof_lps0_shctrl.tsv
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/ddr_lps0_shprdx1_lps0_shctrl.tsv
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/ddr_lps3_shmof_lps3_shctrl.tsv
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/ddr_lps3_shprdx1_lps3_shctrl.tsv
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/ddr_lps12_shmof_lps12_shctrl.tsv
/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/ddr_lps12_shprdx1_lps12_shctrl.tsv
                     GeneID  log2FoldChange          padj     ensembl_gene_id  \
1      ENSMUSG00000000056.7       -0.246523  1.016272e-03  ENSMUSG00000000056   
2      ENSMUSG00000000078.6       -0.249566  1.041826e-04  ENSMUSG00000000078   
3     ENSMUSG00000000085.16       -0.541236  8.327553e-11  ENSMUSG00000000085   
6     ENSMUSG00000000154.16       -0.435672  2.197982e-03  ENSMUSG00000000154   
8     ENSMUSG00

In [13]:
for time in ["lps0", "lps3", "lps12"]:
    for cond, df in up_dfs[time].items():
        enr = gp.enrichr(gene_list=df["external_gene_name"].astype(str),
                          description=time+"_"+cond+"_up",
                          gene_sets='KEGG_2019_Mouse',
                          background= 30000, # the number of genes, e.g 20000
                          outdir=os.path.join(out_path,'enricher_report_kegg_2019_'+time+"_"+cond+"_up"),
                          cutoff=0.05, # only used for testing.
                          organism='Mouse',
                          verbose=False)
#         dotplot(enr.res2d, title='KEGG_2019_Mouse',cmap='viridis_r')

In [14]:
# Could you help me to sort out up-regulated genes in TNF pathway in LPS3_shCTRL_vs_shMOF and 
# LPS3_shCTRL_vs_shPRDX1 and collect their log2FC from DESeq2 in a file?
gene_name_df = pd.read_csv("/data/akhtar/group2/rabbani/rna_project1904/pairwise_comparison/fdr0.05/gsea/kegg_name.tsv", sep ="\t")

In [15]:
# tnf path ledge_genes in mof (108 genes)
tnf_mof = ["CXCL10", "CXCL3", "PTGS2", "CXCL1", "IL6", "CSF2", "CXCL2", "TRAF1","MMP3", "CSF1", "VCAM1",
           "CCL20", "SELE", "TNFAIP3", "IL15", "CX3CL1", "LIF", "CCL2", "IFNB1", "IL1B", "MAP2K3", "MAPK13",
           "JAG1", "MAP2K6", "NOD2", "ICAM1", "RPS6KA5", "DAB2IP", "CREB3L2", "SOCS3", "MAPK12", "NFKB1",
           "TNFRSF1B", "FAS", "MAP2K4", "FOS", "NFKBIA", "CASP3", "IRF1", "JUNB", "TNF", "MAP2K1", "PIK3R3",
           "CREB5", "CREB3L4", "CFLAR", "BAG4", "MAP3K5", "BCL3", "IKBKB", "TAB3", "TRAF5", "RELA", "BIRC3",
           "TRAF2", "GM5431", "BIRC2", "PGAM5", "AKT3", "ITCH", "ATF4", "IKBKG", "PIK3R2", "ATF2", "CHUK",
           "CASP7", "PIK3R1", "TRAF3", "AKT1", "TRADD", "MAP3K7", "MAPK3", "MAP2K7", "CREB1", "CREB3L1",
           "MAPK8", "AKT2", "IFI47", "RIPK1", "CEBPB", "DNM1L", "MLKL", "PIK3CB", "MAPK1", "FADD", "LTA",
           "TAB1", "ATF6B", "MAP3K8", "CCL5", "PIK3CA", "RIPK3", "EDN1", "TAB2", "CASP8", "TNFRSF1A",
           "JUN", "CREB3", "MAPK11", "MAPK9", "MAP3K14", "MMP9", "IL18R1", "MAPK14", "CREB3L3", "PIK3CD",
           "CCL12", "MMP14"]
tnf_mof_ensemble = gene_name_df.loc[gene_name_df["external_gene_name"].str.upper().isin(tnf_mof)]
tnf_mof_ensemble =tnf_mof_ensemble[["ensembl_gene_id", "ensembl_gene_id_version", "external_gene_name"]]

In [16]:
# tnf path ledge_genes in prdx1 (108 genes)
tnf_prdx1 = ["CCL20", "CXCL3", "CASP7", "MMP9", "CXCL1", "IL1B", "CSF2", "CXCL2", "PTGS2", "IL6",
             "VCAM1", "RPS6KA5", "JAG1", "NFKBIA", "MAP3K5", "TNF", "TNFAIP3", "CREB3L2", "ICAM1",
             "MAPK11", "CXCL10", "NOD2", "IKBKB", "BIRC3", "IRF1", "TRAF1", "TAB3", "CREB3L1", "FAS",
             "BCL3", "RELA", "MAPK12", "MAP2K7", "PGAM5", "MAP3K7", "MMP14", "JUNB", "MAPK14", "CHUK",
             "JUN", "NFKB1", "AKT2", "AKT1", "CCL2", "IL15", "CFLAR", "IKBKG", "RIPK3", "PIK3CA", "TRADD",
             "BIRC2", "MAP2K6", "CEBPB", "MLKL", "MAPK13", "DNM1L", "ATF2", "IFNB1", "TRAF5", "ITCH",
             "CREB3L4", "ATF6B", "SOCS3", "DAB2IP", "CREB1", "MAPK1", "CCL5", "MAP2K4", "MAPK8", "PIK3R1",
             "IFI47", "MAP3K14", "MAP3K8", "MAP2K3", "TAB2", "LIF", "RIPK1", "BAG4", "ATF4", "MAPK9", "PIK3R2",
             "CSF1", "MAPK3", "CREB5", "AKT3", "PIK3R3", "TRAF3", "FOS", "TNFRSF1B", "CREB3", "FADD", "PIK3CB",
             "CASP3", "TNFRSF1A", "TRAF2", "CX3CL1", "TAB1", "MAP2K1", "CASP8", "GM5431", "MMP3", "CREB3L3",
             "SELE", "IL18R1", "EDN1", "PIK3CD", "CCL12", "LTA"]
tnf_prdx1_ensemble = gene_name_df.loc[gene_name_df["external_gene_name"].str.upper().isin(tnf_prdx1)]
tnf_prdx1_ensemble =tnf_prdx1_ensemble[["ensembl_gene_id", "ensembl_gene_id_version", "external_gene_name"]]

In [17]:
deseq_mof = os.path.join(in_path, "ddr_lps3_shmof_lps3_shctrl.tsv")
deseq_mof_df = pd.read_csv(deseq_mof, sep = "\t")
deseq_mof_df['ensembl_gene_id'] = deseq_mof_df['GeneID'].str.split('.', 1).str[0]
deseq_mof_df = deseq_mof_df.merge(tnf_mof_ensemble, on = "ensembl_gene_id", how = "inner")
deseq_mof_df.to_csv(os.path.join(out_path, "tnf_genes_mof_lp3.tsv"), sep = "\t", index = None)
deseq_prdx1 = os.path.join(in_path, "ddr_lps3_shprdx1_lps3_shctrl.tsv")
deseq_prdx1_df = pd.read_csv(deseq_prdx1, sep = "\t")
deseq_prdx1_df['ensembl_gene_id'] = deseq_prdx1_df['GeneID'].str.split('.', 1).str[0]
deseq_prdx1_df = deseq_prdx1_df.merge(tnf_prdx1_ensemble, on = "ensembl_gene_id", how = "inner")
deseq_prdx1_df.to_csv(os.path.join(out_path, "tnf_genes_prdx1_lp3.tsv"), sep = "\t", index = None)
