We aim to run the exomePeak package on each treatment separately.

In [1]:
%reload_ext rpy2.ipython

In [2]:
%%R
suppressMessages(suppressWarnings(library (GenomicFeatures)))
suppressMessages(suppressWarnings(library (exomePeak)))

In [25]:
%%R
suppressMessages(suppressWarnings(library (Guitar)))

In [3]:
%%R 
# Create `output_dir` if it is missing and report what happened.
#
# Args:
#   output_dir: path of the directory to create.
#
# Prints the path followed by "created!" or "already exists!". Missing parent
# directories are created as well, so a nested path works in a single call.
# Stops with an error if the directory cannot be created.
# Returns `output_dir` invisibly.
mkdir <- function(output_dir) {
    if (dir.exists(output_dir)) {
        print(output_dir)
        print("already exists!")
        return(invisible(output_dir))
    }
    # recursive = TRUE: a plain dir.create() only warns and returns FALSE when
    # the parent folder is absent, and "created!" used to be printed anyway.
    created <- dir.create(output_dir, recursive = TRUE)
    if (!created) {
        stop("could not create directory: ", output_dir, call. = FALSE)
    }
    print(output_dir)
    print("created!")
    invisible(output_dir)
}

In [None]:
%%R
# ---- genome annotation ----
# Build the transcript database (TxDb) from the GENCODE v34 GTF file; `txdb`
# is the annotation object passed to exomepeak() and GuitarPlot() later on.
GTF <- "~/genomes/hg38/gencode.v34/gencode.v34.annotation.gtf"

txdb <- makeTxDbFromGFF(
    file = GTF,
    organism = NA
)

R[write to console]: Import genomic features from the file as a GRanges object ... 
R[write to console]: OK

R[write to console]: Prepare the 'metadata' data frame ... 
R[write to console]: OK

R[write to console]: Make the TxDb object ... 
R[write to console]: OK



In [11]:
%%R
txdb

TxDb object:
# Db type: TxDb
# Supporting package: GenomicFeatures
# Data source: ~/genomes/hg38/gencode.v34/gencode.v34.annotation.gtf
# Organism: NA
# Taxonomy ID: NA
# miRBase build ID: NA
# Genome: NA
# transcript_nrow: 228048
# exon_nrow: 748089
# cds_nrow: 275255
# Db created by: GenomicFeatures package from Bioconductor
# Creation time: 2023-02-28 19:52:09 -0800 (Tue, 28 Feb 2023)
# GenomicFeatures version at creation time: 1.36.4
# RSQLite version at creation time: 2.2.5
# DBSCHEMAVERSION: 1.2


# DMSO

In [4]:
# List everything under bam/ whose name starts with "U" (the DMSO samples).
import glob
glob.glob('bam/U*')

['bam/U1.input.bam.bai',
 'bam/U1.m6A.bam.bai',
 'bam/U2.input.bam.bai',
 'bam/U2.m6A.bam.bai',
 'bam/U1.input.bam',
 'bam/U1.m6A.bam',
 'bam/U2.input.bam',
 'bam/U2.m6A.bam']

## 1. Run `exomePeak`

In [5]:
# %%R
# ---- sample metadata (DMSO) ----
# BAM files are named <sample><suffix>.bam, e.g. "U1.m6A.bam" / "U1.input.bam".
Samples <- unlist(c("U1", "U2"))
IP <- ".m6A"        # suffix of the IP BAM files
INPUT <- ".input"   # suffix of the matched input BAM files

# Name of the folder that receives the exomePeak results.
OUTPUT <- "exomepeak"

IP_BAM <- paste0(Samples, IP, ".bam")
INPUT_BAM <- paste0(Samples, INPUT, ".bam")

In [9]:
# %%R 
# Enter bam/ so the bare file names in IP_BAM / INPUT_BAM resolve; the output
# paths in the following cells are therefore written relative to "..".
setwd("bam")

In [10]:
# %%R
# Sanity check: one TRUE/FALSE per input BAM. Only the INPUT files are tested
# here; the IP files are not.
file.exists(INPUT_BAM)

In [None]:
# %%R
# options(digits=5)

# runexomepeak <- function(WINDOW = 50,STEP = 5,LENGTH = 200,ENRICH = 1){
#     EXP = paste('WINDOW',WINDOW,'STEP',STEP,'LENGTH',LENGTH,'ENRICH',ENRICH,sep='-')

#     print (EXP)
    
#     res <- exomepeak(
#         TXDB = txdb,
#         IP_BAM=IP_BAM,
#         INPUT_BAM=INPUT_BAM,

#         OUTPUT_DIR=paste('..',OUTPUT,sep='/'),
#         EXPERIMENT_NAME=EXP,

#         # options
#         WINDOW_WIDTH = WINDOW,
#         SLIDING_STEP = STEP,
#         FRAGMENT_LENGTH = LENGTH,
#     #     PEAK_CUTOFF_PVALUE = 1,
#     #     PEAK_CUTOFF_FDR = 1, # as.double(FDR),
#         FOLD_ENRICHMENT = ENRICH
#     )

#     mkdir(paste('..', OUTPUT, sep='/'))
#     mkdir(paste('..', OUTPUT, EXP, sep='/'))

#     saveRDS(res, paste('..', OUTPUT, EXP, 'results.rds', sep='/'))
# }

In [None]:
%%R
options(digits = 5)

# Peak-calling settings. They are also joined into the experiment name, so the
# results folder is named after the parameter combination.
WINDOW <- 50    # 10 or 25
STEP <- 5
LENGTH <- 200   # 150, 100 or 50
ENRICH <- 1
EXP <- paste(
    "WINDOW", WINDOW, "STEP", STEP, "LENGTH", LENGTH, "ENRICH", ENRICH,
    sep = "-"
)

# Paths start with ".." because an earlier cell switched the working directory
# to bam/.
out_root <- file.path("..", OUTPUT)
exp_dir <- file.path(out_root, EXP)

res <- exomepeak(
    TXDB = txdb,
    IP_BAM = IP_BAM,
    INPUT_BAM = INPUT_BAM,
    OUTPUT_DIR = out_root,
    EXPERIMENT_NAME = EXP,
    # options
    WINDOW_WIDTH = WINDOW,
    SLIDING_STEP = STEP,
    FRAGMENT_LENGTH = LENGTH,
    # PEAK_CUTOFF_PVALUE = 1,
    # PEAK_CUTOFF_FDR = 1, # as.double(FDR),
    FOLD_ENRICHMENT = ENRICH
)

# Make sure both folder levels exist, then store the returned object with the
# run it belongs to.
mkdir(out_root)
mkdir(exp_dir)

saveRDS(res, file.path(exp_dir, "results.rds"))

In [None]:
%%R 
# Step back out of bam/ (undoes the earlier setwd("bam")).
setwd("../")

___

## 2. `RGAC` & `DRACH` motif analysis 

In [6]:
cat ~/GitHub/imRIP/scr/exomepeak-motif.sh

MAIN=$1
MOTIF=/rumi/shams/abe/GitHub/imRIP/motifs.txt

cd ${MAIN}
for sam in *; do
	cd $sam
	echo "__________________________________________________________________________________________"
	echo $sam
	echo "step 1: extract mRNA sequences"
	cat peak.bed | sort -k1,1 -k2,2n peak.bed | cgat bed2bed --method=merge --merge-by-name |  awk '! /#/' | bedtools getfasta -name -s -fi /rumi/shams/genomes/hg38/hg38.fa -bed - -split -fo peak.fa
	echo "--- DONE! ---"

	echo "step 2: prepare inputs for FIRE"
	# perl $TEISERDIR/prep_seqs_for_teiser_run.pl peak.fa peaks
	/rumi/shams/abe/anaconda3/envs/cgat/bin/python $TEISERDIR/prep_fasta_for_fire_run.py peak.fa
	echo "--- DONE! ---"

	echo "step 3: run FIRE for known m6A motifs (non-discovery mode)"
	perl $FIREDIR/fire.pl --expfile=peak_fire.txt --exptype=discrete --fastafile_rna=peak_fire.fa \
	--nodups=1 --dodna=0 --dodnarna=0 --species=human --doskipdiscovery=1 \
	--motiffile_rna=$MOTIF --oribiasonly=0 > non-discovery_FIRE.log
	rm -rv non-discover

In [None]:
!bash ~/GitHub/imRIP/scr/exomepeak-motif.sh ~/Projects/Decitabine-treatment/meRIP-seq/exomepeak

Let's examine how exomepeak parameters affect FIRE's results:

In [18]:
import pandas as pd
from glob import glob

In [102]:
# One [experiment_name, table] pair per exomePeak run: the name is the folder
# directly under exomepeak/, the table is that run's tab-separated
# peak_fire.txt.signif.motifs.rep file read without a header row.
[
    [rep_path.split('/')[1], pd.read_csv(rep_path, sep='\t', header=None)]
    for rep_path in glob(
        'exomepeak/*/non-discovery_FIRE/RNA/peak_fire.txt.signif.motifs.rep'
    )
]

# pd.concat(, axis=0)

[['WINDOW-50-STEP-5-LENGTH-200-ENRICH-1',
                    0  1   2  3      4     5
  0           [AG]GAC  0  49  0  0.790  6902
  1           [AG]GAC  1  51  1  0.824  6902
  2  [AGT][AG]AC[ACT]  0  49  0  0.944  6902
  3  [AGT][AG]AC[ACT]  1  51  1  0.949  6902],
 ['WINDOW-50-STEP-5-LENGTH-50-ENRICH-1',
                    0  1   2  3      4      5
  0           [AG]GAC  0  50  0  0.498  20609
  1           [AG]GAC  1  50  1  0.518  20609
  2  [AGT][AG]AC[ACT]  0  50  0  0.744  20609
  3  [AGT][AG]AC[ACT]  1  50  1  0.755  20609],
 ['WINDOW-10-STEP-5-LENGTH-150-ENRICH-1',
                    0  1   2  3      4    5
  0           [AG]GAC  0  49  0  0.722  227
  1           [AG]GAC  1  51  1  0.758  227
  2  [AGT][AG]AC[ACT]  0  45  0  0.863  227
  3  [AGT][AG]AC[ACT]  1  55  1  0.930  227],
 ['WINDOW-50-STEP-5-LENGTH-150-ENRICH-1',
                    0  1   2  3      4     5
  0           [AG]GAC  0  49  0  0.724  8899
  1           [AG]GAC  1  51  1  0.762  8899
  2  [AGT][AG]AC[

`WINDOW-50-STEP-5-LENGTH-150-ENRICH-1` seems to be the best setting, as it shows enrichment of the `RGAC` motif.

___

### WINDOW-50-STEP-5-LENGTH-150-ENRICH-1


In [44]:
ls exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1

con_peak.bed        exomePeak.Rdata         peak.fa        results.rds
con_peak.xls        [0m[34;42mnon-discovery_FIRE[0m/     peak_fire.fa
[34;42mdiscovery_FIRE[0m/     non-discovery_FIRE.log  peak_fire.txt
discovery_FIRE.log  peak.bed                peak.xls


In [5]:
%%bash 
# Redraw the FIRE summary matrix for the known-motif (non-discovery) run.
# mi_draw_matrix.pl is given paths under peak_fire.txt_FIRE/, so the
# non-discovery_FIRE folder is renamed for the call and renamed back afterwards
# (presumably FIRE expects "<expfile>_FIRE" as folder name - confirm).
fire_home=/flash/bin/FIRE-1.1
run_dir=exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1
fire_out=peak_fire.txt_FIRE/RNA/peak_fire.txt

pwd
# Stop if the run folder is missing; otherwise the mv / cd commands below
# would act on whatever directory the cell started in.
cd "$run_dir" || exit 1
pwd
# export FIREDIR=/flash/bin/FIRE-1.1
# export LD_LIBRARY_PATH=/flash/bin/FIRE-1.1/modules/lib
# date

# Without the renamed folder none of the input files below exist, so give up.
mv non-discovery_FIRE peak_fire.txt_FIRE || exit 1

echo "RNA, Step 7: draw matrix figure."
perl "$fire_home/SCRIPTS/mi_draw_matrix.pl" \
    --expfile="$fire_out" \
    --matfile="$fire_out.matrix" \
    --summaryfile="$fire_out.summary" \
    --columnsfile="$fire_out.columns" \
    --ps2pdf=1 --every=1 --quantized=1 --motifnames="$fire_out.motifnames" \
    --ybase=250 --colmap="$fire_home/SCRIPTS/HEATMAPS/cmap2.txt" \
    --clustfile="$fire_out.clusters" \
    --lp_t_draw=10 \
    --gofile="$fire_out.GO"

# Restore the original folder name even if the drawing step failed.
mv peak_fire.txt_FIRE non-discovery_FIRE

cd ../../

pwd

/rumi/shams/abe/Projects/Decitabine-treatment/meRIP-seq
/rumi/shams/abe/Projects/Decitabine-treatment/meRIP-seq/exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1
RNA, Step 7: draw matrix figure.
Now doing the graphical display.
xsize = 1200, ysize = 400, xbase = 35, ybase = 250
Processing [AG]GAC ... Outputing motif 0.eps ... Done.
Plotting significance boxes.
Creating peak_fire.txt_FIRE/RNA/peak_fire.txt.summary.eps ...Done.
Creating PDF peak_fire.txt_FIRE/RNA/peak_fire.txt.summary.pdf ... Done.
/rumi/shams/abe/Projects/Decitabine-treatment/meRIP-seq


In [7]:
# Copy the FIRE summary PDF into plots/ and run the pdf2png.sh helper on the copy.
!cp -v exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1/non-discovery_FIRE/RNA/peak_fire.txt.summary.pdf plots/FIRE-known-motifs.pdf
!bash ~/GitHub/Abe/my_scripts/pdf2png.sh plots/FIRE-known-motifs.pdf

'exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1/non-discovery_FIRE/RNA/peak_fire.txt.summary.pdf' -> 'plots/FIRE-known-motifs.pdf'
plots/FIRE-known-motifs.pdf > plots/FIRE-known-motifs.png
done!


# Decitabine

In [5]:
# Re-import the module here: an earlier cell runs `from glob import glob`,
# which rebinds the name `glob` to the function and would make `glob.glob`
# raise AttributeError when the notebook is executed top to bottom.
import glob
glob.glob('bam/T*')

['bam/T1.input.bam.bai',
 'bam/T1.m6A.bam.bai',
 'bam/T2.input.bam.bai',
 'bam/T2.m6A.bam.bai',
 'bam/T1.input.bam',
 'bam/T1.m6A.bam',
 'bam/T2.input.bam',
 'bam/T2.m6A.bam']

## 1. Run `exomePeak`

In [9]:
%%R
# ---- sample metadata (Decitabine) ----
# BAM files are named <sample><suffix>.bam, e.g. "T1.m6A.bam" / "T1.input.bam".
Samples <- unlist(c("T1", "T2"))
IP <- ".m6A"        # suffix of the IP BAM files
INPUT <- ".input"   # suffix of the matched input BAM files

# Name of the folder that receives the exomePeak results.
# NOTE(review): this is the same folder name as in the DMSO section; with an
# identical experiment name both treatments would be written to the same
# place - confirm that this is intended.
OUTPUT <- "exomepeak"

IP_BAM <- paste0(Samples, IP, ".bam")
INPUT_BAM <- paste0(Samples, INPUT, ".bam")

In [12]:
%%R 
# Enter bam/ so the bare file names in IP_BAM / INPUT_BAM resolve; the output
# paths in the following cells are therefore written relative to "..".
setwd("bam")

In [13]:
%%R
# Sanity check: one TRUE/FALSE per input BAM. Only the INPUT files are tested
# here; the IP files are not.
file.exists(INPUT_BAM)

[1] TRUE TRUE


In [None]:
%%R
options(digits = 5)

# Peak-calling settings. They are also joined into the experiment name, so the
# results folder is named after the parameter combination.
WINDOW <- 50    # 10 or 25
STEP <- 5
LENGTH <- 200   # 150, 100 or 50
ENRICH <- 1
EXP <- paste(
    "WINDOW", WINDOW, "STEP", STEP, "LENGTH", LENGTH, "ENRICH", ENRICH,
    sep = "-"
)

# Paths start with ".." because an earlier cell switched the working directory
# to bam/.
out_root <- file.path("..", OUTPUT)
exp_dir <- file.path(out_root, EXP)

res <- exomepeak(
    TXDB = txdb,
    IP_BAM = IP_BAM,
    INPUT_BAM = INPUT_BAM,
    OUTPUT_DIR = out_root,
    EXPERIMENT_NAME = EXP,
    # options
    WINDOW_WIDTH = WINDOW,
    SLIDING_STEP = STEP,
    FRAGMENT_LENGTH = LENGTH,
    # PEAK_CUTOFF_PVALUE = 1,
    # PEAK_CUTOFF_FDR = 1, # as.double(FDR),
    FOLD_ENRICHMENT = ENRICH
)

# Make sure both folder levels exist, then store the returned object with the
# run it belongs to.
mkdir(out_root)
mkdir(exp_dir)

saveRDS(res, file.path(exp_dir, "results.rds"))

R[write to console]: 'select()' returned 1:many mapping between keys and columns



[1] "Divide transcriptome into chr-gene-batch sections ..."
[1] "Get Reads Count ..."
[1] "This step may take a few hours ..."
[1] "0.275 %"
[1] "0.549 %"
[1] "0.824 %"
[1] "1.1 %"
[1] "1.37 %"
[1] "1.65 %"
[1] "1.92 %"
[1] "2.2 %"
[1] "2.47 %"
[1] "2.75 %"
[1] "3.02 %"
[1] "3.3 %"
[1] "3.57 %"
[1] "3.85 %"
[1] "4.12 %"
[1] "4.4 %"
[1] "4.67 %"
[1] "4.95 %"
[1] "5.22 %"
[1] "5.49 %"
[1] "5.77 %"
[1] "6.04 %"
[1] "6.32 %"
[1] "6.59 %"
[1] "6.87 %"
[1] "7.14 %"
[1] "7.42 %"
[1] "7.69 %"
[1] "7.97 %"
[1] "8.24 %"
[1] "8.52 %"
[1] "8.79 %"
[1] "9.07 %"


In [16]:
%%R 
# Step back out of bam/ (undoes the earlier setwd("bam")).
setwd("../")

___

## 2. Metagene plots
Now, let's draw **metagene plots** (using a separate conda env and IPython kernel):

In [26]:
%%R 
# Metagene plots for the peaks of one exomePeak run; "Guitar" is the prefix of
# the figure files that get written.
GuitarPlot(
    txTxdb = txdb,
    stBedFiles = list(
        "exomepeak/WINDOW-50-STEP-5-LENGTH-200-ENRICH-1/peak.bed"
    ),
    miscOutFilePrefix = "Guitar"
)

[1] "20230301113706"
[1] "There are 228048 transcripts of 60669 genes in the genome."
[1] "total 228048 transcripts extracted ..."
[1] "total 97941 transcripts left after ambiguity filter ..."
[1] "total 97941 transcripts left after check chromosome validity ..."
[1] "total 10939 mRNAs left after component length filter ..."
[1] "total 60598 ncRNAs left after ncRNA length filter ..."
[1] "generate components for all tx"
[1] "generate components for mRNA"
[1] "generate components for lncRNA"
[1] "generate chiped transcriptome"
[1] "generate coverage checking ranges for tx"
[1] "generate coverage checking ranges for mrna"
[1] "generate coverage checking ranges for ncrna"
[1] "20230301114018"
[1] "import BED file exomepeak/WINDOW-50-STEP-5-LENGTH-200-ENRICH-1/peak.bed"
[1] "sample 10 points for Group1"
[1] "start figure plotting for tx ..."
[1] "start figure plotting for mrna ..."
[1] "start figure plotting for ncrna ..."


In [28]:
%%bash 
# Move the Guitar figures into plots/, adding a DAC_ prefix to each file name.
for f in Guitar_*; do
    # When nothing matches, the pattern is left as the literal "Guitar_*";
    # skip it instead of letting mv fail on a non-existent file.
    [ -e "$f" ] || continue
    # Quoted so file names containing spaces are moved as a single argument.
    mv -v -- "$f" "plots/DAC_$f"
done

‘Guitar_mrna_test.pdf’ -> ‘plots/DAC_Guitar_mrna_test.pdf’
‘Guitar_ncrna_test.pdf’ -> ‘plots/DAC_Guitar_ncrna_test.pdf’
‘Guitar_tx_test.pdf’ -> ‘plots/DAC_Guitar_tx_test.pdf’


## 3. `RGAC` & `DRACH` motif analysis 

In [18]:
cat /data_gilbert/home/aarab/Workflows/imRIP/scr/exomepeak-motif.sh

MAIN=$1
MOTIF=/rumi/shams/abe/GitHub/imRIP/motifs.txt

cd ${MAIN}
for sam in *; do
	cd $sam
	echo "__________________________________________________________________________________________"
	echo $sam
	echo "step 1: extract mRNA sequences"
	cat peak.bed | sort -k1,1 -k2,2n peak.bed | cgat bed2bed --method=merge --merge-by-name |  awk '! /#/' | bedtools getfasta -name -s -fi /rumi/shams/genomes/hg38/hg38.fa -bed - -split -fo peak.fa
	echo "--- DONE! ---"

	echo "step 2: prepare inputs for FIRE"
	# perl $TEISERDIR/prep_seqs_for_teiser_run.pl peak.fa peaks
	/rumi/shams/abe/anaconda3/envs/cgat/bin/python $TEISERDIR/prep_fasta_for_fire_run.py peak.fa
	echo "--- DONE! ---"

	echo "step 3: run FIRE for known m6A motifs (non-discovery mode)"
	perl $FIREDIR/fire.pl --expfile=peak_fire.txt --exptype=discrete --fastafile_rna=peak_fire.fa \
	--nodups=1 --dodna=0 --dodnarna=0 --species=human --doskipdiscovery=1 \
	--motiffile_rna=$MOTIF --oribiasonly=0 > non-discovery_FIRE.log
	rm -rv non-discover

In [None]:
!bash /data_gilbert/home/aarab/Workflows/imRIP/scr/exomepeak-motif.sh exomepeak/WINDOW-50-STEP-5-LENGTH-200-ENRICH-1/

Let's examine how exomepeak parameters affect FIRE's results:

In [None]:
import pandas as pd
from glob import glob

In [None]:
# One [experiment_name, table] pair per exomePeak run: the name is the folder
# directly under exomepeak/, the table is that run's tab-separated
# peak_fire.txt.signif.motifs.rep file read without a header row.
[
    [rep_path.split('/')[1], pd.read_csv(rep_path, sep='\t', header=None)]
    for rep_path in glob(
        'exomepeak/*/non-discovery_FIRE/RNA/peak_fire.txt.signif.motifs.rep'
    )
]

# pd.concat(, axis=0)

`WINDOW-50-STEP-5-LENGTH-150-ENRICH-1` seems to be the best setting, as it shows enrichment of the `RGAC` motif.

___

### WINDOW-50-STEP-5-LENGTH-150-ENRICH-1


In [None]:
%%bash 
# Redraw the FIRE summary matrix for the known-motif (non-discovery) run.
# mi_draw_matrix.pl is given paths under peak_fire.txt_FIRE/, so the
# non-discovery_FIRE folder is renamed for the call and renamed back afterwards
# (presumably FIRE expects "<expfile>_FIRE" as folder name - confirm).
fire_home=/flash/bin/FIRE-1.1
run_dir=exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1
fire_out=peak_fire.txt_FIRE/RNA/peak_fire.txt

pwd
# Stop if the run folder is missing; otherwise the mv / cd commands below
# would act on whatever directory the cell started in.
cd "$run_dir" || exit 1
pwd
# export FIREDIR=/flash/bin/FIRE-1.1
# export LD_LIBRARY_PATH=/flash/bin/FIRE-1.1/modules/lib
# date

# Without the renamed folder none of the input files below exist, so give up.
mv non-discovery_FIRE peak_fire.txt_FIRE || exit 1

echo "RNA, Step 7: draw matrix figure."
perl "$fire_home/SCRIPTS/mi_draw_matrix.pl" \
    --expfile="$fire_out" \
    --matfile="$fire_out.matrix" \
    --summaryfile="$fire_out.summary" \
    --columnsfile="$fire_out.columns" \
    --ps2pdf=1 --every=1 --quantized=1 --motifnames="$fire_out.motifnames" \
    --ybase=250 --colmap="$fire_home/SCRIPTS/HEATMAPS/cmap2.txt" \
    --clustfile="$fire_out.clusters" \
    --lp_t_draw=10 \
    --gofile="$fire_out.GO"

# Restore the original folder name even if the drawing step failed.
mv peak_fire.txt_FIRE non-discovery_FIRE

cd ../../

pwd

In [None]:
# Copy the FIRE summary PDF into plots/ and run the pdf2png.sh helper on the copy.
# NOTE(review): the destination name plots/FIRE-known-motifs.pdf is the same one
# used in the DMSO section, so this copy would replace that figure - confirm
# whether a treatment prefix (cf. plots/DAC_Guitar_*) is wanted here.
!cp -v exomepeak/WINDOW-50-STEP-5-LENGTH-150-ENRICH-1/non-discovery_FIRE/RNA/peak_fire.txt.summary.pdf plots/FIRE-known-motifs.pdf
!bash ~/GitHub/Abe/my_scripts/pdf2png.sh plots/FIRE-known-motifs.pdf

# 

In [29]:
!conda env export --from-history 

name: /data_gilbert/home/aarab/anaconda3/envs/mamba/envs/exomepeak
channels:
  - numba
  - anaconda
  - r
  - conda-forge
  - bioconda
dependencies:
  - bioconductor-exomepeak
  - bioconductor-genomicfeatures
  - r-ggplot2
  - r-tidyverse
  - pip
  - pandas
  - numpy
  - ipykernel
  - r-irkernel
  - bioconductor-guitar
  - ca-certificates
  - openssl
prefix: /data_gilbert/home/aarab/anaconda3/envs/mamba/envs/exomepeak


In [30]:
%%R 
sessionInfo()

R version 3.6.1 (2019-07-05)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /data_gilbert/home/aarab/anaconda3/envs/mamba/envs/exomepeak/lib/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] stats4    parallel  tools     stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] Guitar_2.0.0                dplyr_1.0.6                
 [3] knitr_1.33                  ggplot2_3.3.3              
 [5] magrittr_2.0.1              exomePeak_2.17.0           
 [7] GenomicAlignments_1.20.1    SummarizedExperiment_1.14

In [31]:
!date

Wed Mar  1 11:46:27 PST 2023
