-
Notifications
You must be signed in to change notification settings - Fork 0
/
Single_molecule_methylation_call_CG_GC_WG_MM_TFcentered_EK2022.r
142 lines (119 loc) · 5.89 KB
/
Single_molecule_methylation_call_CG_GC_WG_MM_TFcentered_EK2022.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#TITLE: WG MM TF single molecule methylation call for CpG and GpC
#AUTHOR: Arnaud Krebs & Elisa Kreibich
#DATE: 27-10-2022
#VERSION: TFBS analysis strategy
#AIM: Making the single molecule methylation call for CGs and GCs in a TF-motif-centered with at least one CpG fashion.
# The output file is a multi-level list object with the following structure: [[C-context (1:GC|2:CG)]] [[TF bins (1:upstream | 2:middle | 3:downstream)]] [[TFBS]] [[binding state (000-111)]] vector of methylation values of single molecule reads.
# VERSION USES JASPAR 2018 MOTIFS
# Load libraries and dependencies ------------------------------------------------------
library(QuasR)
library("BSgenome.Mmusculus.UCSC.mm10")
source('scripts/utilities/RFunctions_for_single_molecule_call.R')
# Load arguments ---------------------------------------------------------------------
# Extract arguments when running on the cluster
args = commandArgs(trailingOnly=TRUE)
if (length(args)==0) {
stop("At least one argument must be supplied (input file).n", call.=FALSE)
} else if (length(args)==1) {
# default output file
args[2] = "out.txt"
}
INPUT_QUASR = args[1] #QuasR input file for qAlign (tab-delimted file with FileName (location of BAM file) and SampleName) | best only one sample/ replicate per input file | see example file
WDW_SIZE = args[2] #101 bp for CA analysis used in Kreibich et al, 2023
CHR1 = args[3] #chromosome to start with (e.g. "1")
CHR2 = args[4] #chromosome to end with (e.g. "19")
DIR_WD = args[5] #working directory (Github repository directory of Kreibich_2023_5mC_at_enhancers)
INPUT_TFBS = args[6] #input GRanges file for TFBS regions - from JASPAR 2018
replicate = "" #put "", when no selection of replicate, then the script runs through all samples and reps listed in the input file
## Define arguments when NOT running on the cluster
# INPUT_QUASR = 'data/example/QuasR_aligned_files_ES_NO_R1_R2_R5a6_rmdup.txt'
# WDW_SIZE = 30
# CHR1 = 1
# CHR2 = 19
# DIR_WD = './Kreibich_2023_5mC_at_enhancers/' #location of this repository
# INPUT_TFBS = 'data/GRange_mapped_jaspar2018_ChIP_score10_inBaits_BANP.rds'
# Load environment ---------------------------------------------------------------------
dir.create(path = paste0(DIR_WD, 'data_results/')
dir.create(path = paste0(DIR_WD, 'data_results/single_molecule_call/')
OUTDIR <- paste0(DIR_WD, 'data_results/single_molecule_call/')
# Load aligned data ---------------------------------------------------------------------
Qproj = qAlign(sampleFile = INPUT_QUASR,
genome = "BSgenome.Mmusculus.UCSC.mm10",
paired = "fr",
bisulfite = "undir")
Qproj@aligner = "Rbowtie"
Qaln = as.data.frame(alignments(Qproj)[[1]])
sampleNames = unique(alignments(Qproj)[[1]][,2])
if(!replicate == ""){
sampleNames = sampleNames[grepl(replicate, sampleNames)]
}
message("Aligned data loaded")
# Load mapped motifs ---------------------------------------------------------------------
#ChIP validated updated JASPAR OBJECT 2018 - including BANP - only within Baits
MotDBf <- readRDS(INPUT_TFBS)
MotDBfs <- MotDBf#[MotDBf$name=='REST']
wmMapped_SOu <- MotDBfs
message("TF motifs loaded")
# Step 1: Create genomic chunks (based on TF positions) ---------------------------------------------------------------------
## Sort TFobject
wms <- (wmMapped_SOu[order(start(wmMapped_SOu))])
## Split by chr
wms.c <- split(wms, seqnames(wms))
regs <- unlist(GRangesList(lapply(seq_along(wms.c), function(i){
x <- wms.c[[i]]
l <- length(x)
if(l>0){
#cut by TF position
seq.i <- seq(1,l,round(l/25))
st.seq <- start(x[seq.i])
lowB <- st.seq[1:(length(st.seq)-1)]-1000
lowB[lowB<0] <- 1
highB <- st.seq[2:(length(st.seq))]+1000
highB[highB>seqlengths(Mmusculus)[as.character(seqnames(wms.c[[i]])[1])]] <- seqlengths(Mmusculus)[as.character(seqnames(wms.c[[i]])[1])]
gr <- GRanges(seqnames(x[1]), IRanges(lowB, highB))
gr
}else(GRanges())
})))
message("TF motifs sorted")
# Step 2: Perform vectorial read extraction ---------------------------------------------------------------------
## Define the sample you want to analyse
sp.sbs <- seq_along(sampleNames)[seq_along(sampleNames)]
sampleNames[sp.sbs]
## Read extraction
for(sp in sp.sbs){
# sp=1
print(sampleNames[sp])
for(CHR in seq(CHR1, CHR2)){
# CHR = 19
print(CHR)
regsD <- cbind(as.data.frame(regs)[,1:3], sample = sampleNames[sp])
regsD <- regsD[regsD[,1] %in% paste('chr', 1:19, sep = ''),]
regsD <- regsD[regsD[,1] %in% paste('chr', CHR, sep = ''),]
regsL <- split(regsD, 1:nrow(regsD))
print(as.character(sampleNames[sp]))
sRs <- mclapply(seq_along(regsL), function(i){
# i=1
# regDF=regsL[[i]]
# target_range=wmMapped_SOu
# projAll=Qproj
if(WDW_SIZE == 14){
sR <- extract.CG.GC.over.TF.states(regsL[[i]], Qproj, wmMapped_SOu, inMs=c(-7,7), upMs=c(-35,-25), doMs=c(25,35))
} else if(WDW_SIZE == 30) {
sR <- extract.CG.GC.over.TF.states(regsL[[i]], Qproj, wmMapped_SOu, inMs=c(-15,15), upMs=c(-35,-25), doMs=c(25,35))
}
sR
}, mc.cores = 10)
### Merge the genomic bins --> return a list organised by (1) context, (2) bin type
sRs_t <- lapply(seq(2), function(context){
lapply(seq(3), function(bin){
sR_g <- lapply(seq(sRs), function(gbin){
sRs[[gbin]][[context]][[bin]]
})
do.call(c, sR_g)
})
})
saveRDS(sRs_t, paste0(OUTDIR, 'tmp_', sampleNames[sp], '_CG_TF_jaspar2018_ChIP_score10_inBaits_BANP_sorted_met_vect_inM', WDW_SIZE, '_chr', CHR,'_NWCGW.rds'))
message(paste(paste0(OUTDIR, 'tmp_', sampleNames[sp], '_CG_TF_jaspar2018_ChIP_score10_inBaits_BANP_sorted_met_vect_inM', WDW_SIZE, '_chr', CHR,'_NWCGW.rds'), 'saved'))
}
}
message(Sys.time(), ': Script finished. Input arguments were:\nfile: ', INPUT_QUASR, '\nWDW_SIZE: ', WDW_SIZE, '\nchromosomes: ', CHR1, ' to ', CHR2)