# Pathway enrichment analysis

<!-- Luke and Ray have shared the CRISPRi screening results, which were analyzed with the [ScreenProcessing](https://github.com/mhorlbeck/ScreenProcessing) pipeline.  -->

<!-- - $\gamma$ - gamma score -->
<!-- - $\rho$ - rho score -->

<!-- - Pathway enrichment analysis over rho scores -->
<!-- - Load screening result tables into python 
- Make sure gene names are correctly assigned 
 -->

<!-- Alex Ge: 
> If we do Enrichr analysis on the resistance/sensitivity hits defined by Max’s cutoffs, (n = 418 genes), we do see mRNA methylation (adj p = 0.018) and RNA destabilization (adj p = 0.005) come out as significant GO biological processes. METTL3 is included in these GO terms.

> If we do Enrichr analysis on just the resistance hits (n = 197), mRNA methylation is even more significant (adj p = 0.002), which makes sense since we see more METTL3 biology on the resistance side. It is one of the top five GO terms by adjusted P-value.

> This analysis was done today with the 2021 GO terms, which have updated annotations for the newer m6A genes. When I did the same Enrichr analysis in 2018, RNA destabilization and mRNA methylation were not as significant since the GO annotations were not updated.

> I think Enrichr analysis might make more sense here – in Abe’s analysis, I can see that a lot of rho scores that are < 0.2 are being included in the analysis; these are likely to be statistically insignificant. It also looks graphically like the highest bin is including rho values that are < 0?
 -->

In [1]:
from matplotlib_venn import venn2

In [2]:
from IPython.display import IFrame

In [3]:
from glob import glob

import sys
import pandas as pd
import numpy as np 
from itertools import chain, product

sys.path.append("../../")
pager_dir = "/data_gilbert/home/aarab/Projects/pager/"
sys.path.append(pager_dir)

from scripts.util import *
import ipage_down as ipd

In [4]:
# wd = '/rumi/shams/abe/Projects/Decitabine-treatment/'
wd = '/data_gilbert/home/aarab/Projects/Decitabine-treatment/DAC'

In [5]:
data = load_data(screens=True,wd=wd)

In [6]:
data.keys()

dict_keys(['hl60_exp1_DAC_rho', 'hl60_exp1_DAC_gamma', 'hl60_exp2_DAC_rho', 'hl60_exp2_DAC_gamma', 'hl60_exp2_GSK_rho', 'hl60_exp2_GSK_gamma', 'molm13_exp_DAC_rho', 'molm13_exp_DAC_gamma', 'molm13_exp_GSK_rho', 'molm13_exp_GSK_gamma'])

## Run `iPAGE`:

https://medium.com/analytics-vidhya/techniques-to-transform-data-distribution-565a4d0f2da


In [93]:
from matplotlib import pyplot
from scipy.stats import yeojohnson

# Concatenate the tables returned by `find_top` for the HL-60 exp1 DAC rho
# screen (values cast to float first) and turn the index into a regular column.
# NOTE(review): `find_top` comes from the `scripts.util` star import; the meaning
# of the thresholds 0 and 1 is not visible here — presumably they keep every
# gene on both the "up" and "down" sides. Confirm against the helper.
rho = pd.concat(find_top(data['hl60_exp1_DAC_rho'].astype(float),'rho score',0,'Mann-Whitney p-value',1)).reset_index()

up:  8896
down: 9864


In [103]:
def fdr_diff_table(df, fold_change, stat_val):
    """Attach a signed significance column named ``fdr`` to ``df``.

    The new column is ``sign(df[fold_change]) * (1 - df[stat_val])``, so its
    sign follows the ``fold_change`` column and its magnitude grows as the
    ``stat_val`` column shrinks.

    Parameters
    ----------
    df : table holding both columns; it is modified in place.
    fold_change : name of the column whose sign gives the direction.
    stat_val : name of the column subtracted from 1.

    Returns
    -------
    The same ``df`` object, now carrying the ``fdr`` column.
    """
    direction = np.sign(df[fold_change])
    strength = 1 - df[stat_val]
    df['fdr'] = direction * strength
    return df

In [189]:
# Export the gene name / rho score pairs as a tab-separated file without the
# index; the `*delta_phenotype*.txt` glob in the iPAGE cell picks this file up.
rho[['gene_name','rho score']].to_csv(
    'hl60_exp1_DAC_rho_delta_phenotype.txt',sep='\t',index=None, header=None
)

In [190]:
!head hl60_exp1_DAC_rho_delta_phenotype.txt

gene_name	rho score
A1CF	0.00493164595946
A2ML1	0.0625139181208
A4GALT	0.0952827862429
A4GNT	0.00884423785995
AADACL4	0.0252223337879
AAED1	0.0294955169881
AAK1	0.0549906505538
AAR2	0.118809960392
AARS2	0.179262551668


In [109]:
# Compute the signed (1 - p-value) column with `fdr_diff_table` and write one
# `gene_name<TAB>fdr` row per gene, without a header line.
# Note: `fdr_diff_table` assigns the column on `rho` itself, so `rho` keeps an
# `fdr` column after this cell runs.
fdr_diff_table(
    rho,'rho score','Mann-Whitney p-value'
).set_index('gene_name')[['fdr']].to_csv(
    'hl60_exp1_DAC_rho_delta_phenotype_fdr.txt',sep='\t',header=None
)

In [110]:
!head hl60_exp1_DAC_rho_delta_phenotype_fdr.txt

A1CF	0.20091296344200005
A2ML1	0.825801017649
A4GALT	0.99259028423435
A4GNT	0.11282162900899995
AADACL4	0.129521651322
AAED1	0.43757634167399995
AAK1	0.596610457282
AAR2	0.896198649463
AARS2	0.999131696699001
AASS	0.49524476780500004


In [114]:
ls *delta_phenotype*.txt

hl60_exp1_DAC_rho_delta_phenotype_fdr.txt
hl60_exp1_DAC_rho_delta_phenotype.txt


In [115]:
%%bash 
# Run `ipage_loop.sh` once per `*delta_phenotype*.txt` file through GNU parallel
# (up to 18 jobs, -k keeps output in input order); parallel's stdout and stderr
# are written to ipage.out.
# PAGEDIR is exported for the child processes — presumably the iPAGE install
# location read by the loop script; confirm.
export PAGEDIR=/data_gilbert/home/aarab/Workflows/iPAGE

# NOTE(review): `nohup` here wraps only `ls`, not `parallel` — confirm this is intended.
nohup ls *delta_phenotype*.txt | parallel -j18 -k bash ~/Projects/pager/ipage_loop.sh  {} &> ipage.out

## Interpret results – `pager`
https://github.com/abearab/pager

In [116]:
exp = 'hl60_exp1_DAC_rho_delta_phenotype'

def get_pvmatrix_list(parent_path,pattern):
    """Return the ``pvmatrix.txt`` paths for one family of gene-set clusters.

    parent_path: directory that holds one sub-directory per gene-set cluster
    pattern: msigdb gene set cluster name; every sub-directory whose name
        contains this text is matched

    Returns a list of paths (empty when nothing matches). The list is sorted
    because ``glob`` yields matches in arbitrary, filesystem-dependent order,
    and a stable order keeps downstream merges reproducible between runs.
    """
    return sorted(glob(f'{parent_path}/*{pattern}*/pvmatrix.txt'))

### Draw iPAGE heatmap

### C5 GO

In [117]:
# Merge every C5 GO p-value matrix found for `exp`, write it to a scratch file,
# render the heatmap with pager's `ipage_draw_matrix.sh`, then move the PDF into
# plots/ and delete the scratch file.
# NOTE(review): `&> /dev/null` discards all output of the drawing script, so a
# failed render would likely only show up when `mv` cannot find the PDF.
pdf = 'CRISPRi-rho-pager-GO_all.pdf'

ipd.merge_multiple_pvmat(
    get_pvmatrix_list(exp,'c5.go')
).to_csv('temp-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    {exp}'.txt' "temp-pvmatrix.txt" \
    {pdf} &> /dev/null

!mv -v {pdf} plots/
!rm -v 'temp-pvmatrix.txt'

‘CRISPRi-rho-pager-GO_all.pdf’ -> ‘plots/CRISPRi-rho-pager-GO_all.pdf’
removed ‘temp-pvmatrix.txt’


In [118]:
# Stack the `pvmat2bio_signal` selections taken from the merged C5 GO matrix for
# side 'both' with n_clust = 1, 2 and 3 (thr=2).
# NOTE(review): the merged matrix is rebuilt on every iteration; it could be
# computed once if `pvmat2bio_signal` does not modify its input — confirm.
pv_signal_go0 = pd.concat([ 
    ipd.pvmat2bio_signal( 
        ipd.merge_multiple_pvmat(get_pvmatrix_list(exp,'c5.go')), s, n_clust=n, thr=2
    ) for s in ['both'] for n in [1,2,3] 
])

In [123]:
# pv_signal_go0

In [122]:
# gs = 'GOBP_MRNA_PROCESSING'

# pd.DataFrame([
#     (n,','.join(list(ipd.bin_identifier_genes(
#         'hl60_exp1_DAC_rho_delta_phenotype/msigdb_v7.4_c5.go',str(n),gs
#     ).values())[0])) for n in [0,
#                                #1,2,3,4,6,7,8,9,
#                                10]
# ],columns=['clust',gs]).set_index('clust').to_dict()

In [16]:
# Draw the heatmap for the `pv_signal_go0` selection: write it to a scratch
# file, render it with `ipage_draw_matrix.sh`, move the PDF into plots/ and
# delete the scratch file. The drawing script's output is discarded.
pdf = 'CRISPRi-rho-pager-GO-both.pdf'

pv_signal_go0.to_csv('temp-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    {exp}'.txt' "temp-pvmatrix.txt" \
    {pdf} &> /dev/null
!mv -v {pdf} plots/
!rm -v 'temp-pvmatrix.txt'

‘CRISPRi-rho-pager-GO-both.pdf’ -> ‘plots/CRISPRi-rho-pager-GO-both.pdf’
removed ‘temp-pvmatrix.txt’


In [69]:
# Stack the `pvmat2bio_signal` selections taken from the merged C5 GO matrix for
# every combination of side ('up', 'down', 'both') and n_clust (1, 2), with thr=1.
# NOTE(review): the merged matrix is rebuilt for each of the six combinations;
# whether the stacked result can contain the same gene set twice is not visible
# from here — confirm.
pv_signal_go1 = pd.concat([ 
    ipd.pvmat2bio_signal( 
        ipd.merge_multiple_pvmat(get_pvmatrix_list(exp,'c5.go')), s, n_clust=n, thr=1
    ) for s in ['up','down','both'] for n in [1,2] 
]) 

In [70]:
# Draw the heatmap for the `pv_signal_go1` selection: write it to a scratch
# file, render it with `ipage_draw_matrix.sh`, move the PDF into plots/ and
# delete the scratch file. The drawing script's output is discarded.
pdf = 'CRISPRi-rho-pager-GO.pdf'

pv_signal_go1.to_csv('temp-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    {exp}'.txt' "temp-pvmatrix.txt" \
    {pdf} &> /dev/null
!mv -v {pdf} plots/
!rm -v 'temp-pvmatrix.txt'

‘CRISPRi-rho-pager-GO.pdf’ -> ‘plots/CRISPRi-rho-pager-GO.pdf’
removed ‘temp-pvmatrix.txt’


In [71]:
IFrame("plots/CRISPRi-rho-pager-GO.pdf", width=600, height=300)

In [33]:
# pdf = 'CRISPRi-rho-pager-GO-down.pdf'

# pv_signal_go2.to_csv('temp-pvmatrix.txt',sep='\t')

# !bash {pager_dir}/ipage_draw_matrix.sh \
#     {exp}'.txt' "temp-pvmatrix.txt" \
#     {pdf} &> /dev/null
# !mv -v {pdf} plots/
# !rm -v 'temp-pvmatrix.txt'

### C2

In [125]:
# Merge the C2 p-value matrices for `exp`, render the heatmap, move the PDF into
# plots/ and delete the scratch file.
# NOTE(review): the output is named "KEGG_all", but the pattern 'c2.cp' is
# wrapped in wildcards by `get_pvmatrix_list`, so it matches every directory
# whose name contains 'c2.cp', not only the KEGG one — confirm which is intended.
pdf = 'CRISPRi-rho-pager-KEGG_all.pdf'

ipd.merge_multiple_pvmat(get_pvmatrix_list(exp,'c2.cp')).to_csv('temp-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    {exp}'.txt' "temp-pvmatrix.txt" \
    {pdf} &> /dev/null
!mv -v {pdf} plots/
!rm -v 'temp-pvmatrix.txt'

‘CRISPRi-rho-pager-KEGG_all.pdf’ -> ‘plots/CRISPRi-rho-pager-KEGG_all.pdf’
removed ‘temp-pvmatrix.txt’


In [181]:
# Stack the `pvmat2bio_signal` selections taken from the merged 'c2.cp.kegg'
# matrices for every combination of side ('up', 'both', 'down') and
# n_clust (1, 2, 3), with thr=2.
# NOTE(review): the merged matrix is rebuilt for each of the nine combinations.
pv_signal_c2 = pd.concat([
    ipd.pvmat2bio_signal(
        ipd.merge_multiple_pvmat(get_pvmatrix_list(exp,'c2.cp.kegg')),s, thr=2,
        n_clust=n
    )
    for s in ['up','both','down'] for n in [1,2,3]
])

In [182]:
pv_signal_c2

Unnamed: 0,[-0.12 -0.10],[-0.10 -0.07],[-0.07 -0.05],[-0.05 -0.03],[-0.03 -0.01],[-0.01 0.00],[0.00 0.02],[0.02 0.03],[0.03 0.05],[0.05 0.08],[0.08 0.1]
KEGG_AMINOACYL_TRNA_BIOSYNTHESIS,-0.872,0.235,0.235,-0.379,-0.872,0.235,0.235,0.93,-0.872,-0.872,3.543
KEGG_SPLICEOSOME,2.357,-0.446,-0.619,-0.446,-0.619,-1.449,-0.313,-0.619,0.561,-1.111,3.163


In [183]:
# Draw the heatmap for the `pv_signal_c2` selection: write it to a scratch file,
# render it with `ipage_draw_matrix.sh`, move the PDF into plots/ and delete the
# scratch file. The drawing script's output is discarded.
pdf = 'CRISPRi-rho-pager-C2.pdf'

pv_signal_c2.to_csv('temp-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    {exp}'.txt' "temp-pvmatrix.txt" \
    {pdf} &> /dev/null
!mv -v {pdf} plots/
!rm -v 'temp-pvmatrix.txt'

‘CRISPRi-rho-pager-C2.pdf’ -> ‘plots/CRISPRi-rho-pager-C2.pdf’
removed ‘temp-pvmatrix.txt’


In [184]:
IFrame("plots/CRISPRi-rho-pager-C2.pdf", width=600, height=300)

### C3

In [28]:
# Merge every C3 p-value matrix found under `exp` (the glob is written inline
# here instead of going through `get_pvmatrix_list`).
pvmat = ipd.merge_multiple_pvmat(
    pvmat_list = glob(f'{exp}/*c3*/pvmatrix.txt')
)

# Stack three selections row-wise: 'down' and 'up' with n_clust=1, then 'both'
# with the helper's default n_clust. No `thr` is passed, so the helper's default
# threshold applies (value not visible here).
bio_signal = pd.concat([
    ipd.pvmat2bio_signal(pvmat,side='down',n_clust=1),
    ipd.pvmat2bio_signal(pvmat,side='up',n_clust=1),
    ipd.pvmat2bio_signal(pvmat,side='both'),
],axis=0)

bio_signal

Unnamed: 0,[-0.12 -0.10],[-0.10 -0.06],[-0.06 -0.04],[-0.04 -0.03],[-0.03 -0.01],[-0.01 0.00],[0.00 0.02],[0.02 0.03],[0.03 0.05],[0.05 0.08],[0.08 0.1]
GGCNKCCATNK_UNKNOWN,4.507,-0.774,0.61,-1.035,-1.359,-0.567,-0.281,-0.405,-0.774,-0.405,1.257
CAVIN1_TARGET_GENES,2.35,1.547,0.906,-0.527,-1.784,-0.811,1.206,-2.655,-0.811,-0.527,0.277
MIR4772_5P,0.75,-0.837,-0.513,-0.301,1.056,-1.328,-1.328,0.498,-1.328,0.498,2.781


In [15]:
# Find which C3 cluster directories `detect_gs_cluster` reports for gene set `gs`.
# NOTE(review): `gs` is not assigned above this cell in file order (the only
# earlier assignment is commented out), so this cell depends on kernel state
# left by another cell; running the notebook top to bottom would raise
# NameError here. Assign `gs` explicitly before this call.
pvmat_list = glob(f'{exp}/*c3*/pvmatrix.txt')
gs_cluster_path = ipd.detect_gs_cluster(pvmat_list, gs=gs)

# Show the 2nd and 3rd '/'-separated components of every reported path.
print ([p.split('/')[1:3] for p in gs_cluster_path])

# Keep only the first reported path and drop the trailing 'pvmatrix.txt',
# leaving the cluster directory prefix.
gs_cluster_path = gs_cluster_path[0].split('pvmatrix.txt')[0]


[['msigdb_v7.4_c3.all', 'pvmatrix.txt'], ['msigdb_v7.4_c3.mir.mirdb', 'pvmatrix.txt']]


In [35]:
# For each listed bin, join the first value returned by `bin_identifier_genes`
# for gene set `gs` into a comma-separated string, then show the entry for bin 0.
# Relies on `gs_cluster_path` set by the preceding C3 cell.
# NOTE(review): bin 5 is missing from the list — confirm whether that is intentional.
gs = 'CAVIN1_TARGET_GENES'

pd.DataFrame([
    (n,','.join(list(ipd.bin_identifier_genes(
        f'{gs_cluster_path}',str(n),gs
    ).values())[0])) for n in [0,1,2,3,4,6,7,8,9,10]
],columns=['clust',gs]).set_index('clust')[gs][0]

'ACBD5,ARID4A,ATF7IP2,CMTM3,LEPROTL1,P4HB,RBBP4,SEC14L1,SLC34A1,STAP2,TCERG1,VDAC2,ZSCAN31'

## Identifier genes of enriched pathways

### GOBP_RNA_MODIFICATION

In [63]:
# Build a {bin: "GENE1,GENE2,..."} mapping for gene set `gs` from the C5 GO
# iPAGE output, restricted to the two lowest (0, 1) and two highest (9, 10)
# bins; the middle bins are deliberately commented out.
# The result directory is hard-coded rather than derived from `exp`.
gs = 'GOBP_RNA_MODIFICATION'

pd.DataFrame([
    (n,','.join(list(ipd.bin_identifier_genes(
        'hl60_exp1_DAC_rho_delta_phenotype/msigdb_v7.4_c5.go',str(n),gs
    ).values())[0])) for n in [0,1,
                               # 2,3,4,6,7,8,
                               9,10]
],columns=['clust',gs]).set_index('clust').to_dict()

{'GOBP_RNA_MODIFICATION': {0: 'ADAT2,CDKAL1,CMTR1,CMTR2,DTWD1,FTSJ1,NOP2,NSUN2,RPUSD1,THUMPD2',
  1: 'ALKBH3,ALKBH5,DUS3L,LCMT2,METTL16,NAT10,NHP2',
  9: 'C9orf64,MEPCE,METTL1,METTL14,METTL5,NSUN4,NSUN6,PARN,RBM15,RPUSD3,RPUSD4,TRMT10A,TRUB1,TRUB2,URM1',
  10: 'AARS2,ALKBH1,ALKBH8,ANKRD16,BCDIN3D,CBLL1,CTU1,CTU2,DKC1,DUS2,ELP2,ELP3,ELP4,ELP5,ELP6,FTO,FTSJ3,GAR1,GTPBP3,KTI12,METTL3,METTL4,NAF1,PUS7L,PUSL1,RPUSD2,SSB,TPRKB,TRMT10C,TRMT2B,WDR4,ZC3H13'}}

### GOBP_NEGATIVE_REGULATION_OF_INTRINSIC_APOPTOTIC_SIGNALING_PATHWAY_BY_P53_CLASS_MEDIATOR

In [623]:
# List the members of this GO gene set.
# NOTE(review): `c5_go_gmt` is not assigned anywhere in the visible notebook —
# presumably it was loaded in a cell that has since been removed, or it comes
# from the `scripts.util` star import; confirm, otherwise this raises NameError.
c5_go_gmt['GOBP_NEGATIVE_REGULATION_OF_INTRINSIC_APOPTOTIC_SIGNALING_PATHWAY_BY_P53_CLASS_MEDIATOR']

['KDM1A',
 'SIRT1',
 'ZNF385A',
 'ING2',
 'MIR21',
 'MDM2',
 'MIF',
 'MUC1',
 'PRKN',
 'TRIAP1',
 'TAF9B',
 'BCL2',
 'BDKRB2',
 'MARCHF7',
 'TAF9',
 'PTTG1IP',
 'ELL3',
 'BCL2L12',
 'ARMC10',
 'CD44',
 'CD74']

In [185]:
# Build a {bin: "GENE1,GENE2,..."} mapping for gene set `gs` from the C5 GO
# iPAGE output, restricted to the lowest (0) and highest (10) bins; the
# remaining bins are deliberately commented out.
# The result directory is hard-coded rather than derived from `exp`.
gs = 'GOBP_NEGATIVE_REGULATION_OF_INTRINSIC_APOPTOTIC_SIGNALING_PATHWAY_BY_P53_CLASS_MEDIATOR'

pd.DataFrame([
    (n,','.join(list(ipd.bin_identifier_genes(
        'hl60_exp1_DAC_rho_delta_phenotype/msigdb_v7.4_c5.go',str(n),gs
    ).values())[0])) for n in [0,
                               # 2,3,4,6,7,8,
                               10]
],columns=['clust',gs]).set_index('clust').to_dict()

{'GOBP_NEGATIVE_REGULATION_OF_INTRINSIC_APOPTOTIC_SIGNALING_PATHWAY_BY_P53_CLASS_MEDIATOR': {0: 'BCL2,CD44,CD74,KDM1A,PTTG1IP,TRIAP1',
  10: 'SIRT1,ZNF385A'}}

### GOCC_SPLICEOSOMAL_COMPLEX

In [82]:
# Build a {bin: "GENE1,GENE2,..."} mapping for gene set `gs` from the C5 GO
# iPAGE output, restricted to the two lowest (0, 1) and two highest (9, 10)
# bins; the middle bins are deliberately commented out.
# The result directory is hard-coded rather than derived from `exp`.
gs = 'GOCC_SPLICEOSOMAL_COMPLEX'

pd.DataFrame([
    (n,','.join(list(ipd.bin_identifier_genes(
        'hl60_exp1_DAC_rho_delta_phenotype/msigdb_v7.4_c5.go',str(n),gs
    ).values())[0])) for n in [0,1,
                               # 2,3,4,6,7,8,
                               9,10]
],columns=['clust',gs]).set_index('clust').to_dict()

{'GOCC_SPLICEOSOMAL_COMPLEX': {0: 'API5,DDX23,DDX39B,DDX5,DHX15,DHX8,HNRNPM,HSPA8,IK,LSM7,PDCD7,PRPF38B,PRPF4,RBM5,SF3A3,SMU1,SNRPA1,SNRPB2,SNRPD2,SNRPG,SUGP1',
  1: 'ALYREF,CWC15,LGALS3,NCL,PRPF40A,SF3A2,SLU7,SNRNP200,SRSF1,TFIP11,USP39,ZMAT2',
  9: 'DHX32,GPATCH1,IVNS1ABP,PPP1R8,RNPC3,SF1,SF3B4,SNIP1,SNRPA,SNRPB,TRA2B,WDR83,ZMAT5',
  10: 'AAR2,AQR,BUD13,CTNNBL1,CWF19L1,HNRNPF,HNRNPH3,HNRNPK,HNRNPR,HNRNPU,LSM2,LSM5,LUC7L3,PHF5A,PNN,PPIE,PPIL1,PPIL3,PRPF8,RBM3,RBMXL2,SF3B5,SNRNP40,SNRNP70,SNRPC,SNRPF,SNW1,SREK1,SYF2,SYNCRIP,WBP11,WBP4,ZCRB1,ZNF830,ZRSR2'}}

# 

In [232]:
!date

Sat Sep 17 14:44:09 PDT 2022
