-
Notifications
You must be signed in to change notification settings - Fork 1
/
assemble_training_data.R
233 lines (202 loc) · 9.95 KB
/
assemble_training_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#' @title Assembles TOP training data for all TF x cell type combinations,
#' then split training data into 10 partitions
#' @description Prepares the training data for fitting TOP models.
#' It splits training data into 10 partitions and
#' assembles training data for all TF x cell type combinations
#' for each of the partitions.
#'
#' @param tf_cell_table A data frame listing all TF x cell type combinations
#' and the training data for each combination.
#' It should have at least three columns, with:
#' TF names, cell types, and file names of the individual training data for
#' each TF x cell type combination. The individual training data
#' should be in .rds or text (.txt, or .csv) format.
#' @param logistic_model Logical. If \code{logistic_model = TRUE},
#' prepare assembled data for the logistic version of TOP model.
#' If \code{logistic_model = FALSE}, prepare assembled data for the
#' quantitative occupancy model (default).
#' @param chip_col The column name of ChIP data in the individual training data
#' (default: \sQuote{chip}).
#' @param training_chrs Chromosomes used for training the model
#' (default: odd chromosomes, chr1, chr3, ..., chr21)
#' @param n_partitions Number of partitions to split the training data (default: 10).
#' @param n_cores Number of cores to run in parallel
#' (default: equal to \code{n_partitions}).
#' @param max_sites Max number of candidate sites to keep for
#' each TF x cell type combination (default: 50000). To reduce computation time,
#' randomly select \code{max_sites} candidate sites for
#' each TF x cell type combination, if the number of candidate sites
#' exceeds \code{max_sites}.
#' @param seed A number for the seed used when sampling sites.
#' @return A list of data frames (default: 10),
#' each containing one partition of the
#' training data with all TF x cell type combinations.
#' @import doParallel
#' @import foreach
#' @importFrom parallel detectCores
#'
#' @export
#' @examples
#' \dontrun{
#'
#' # tf_cell_table should have three columns with:
#' # TF names, cell types, and paths to the training data files, like:
#' # | tf_name | cell_type | data_file |
#' # |:------------:|:-------------:|:------------------------:|
#' # | CTCF | K562 | CTCF.K562.data.rds |
#' # | CTCF | A549 | CTCF.A549.data.rds |
#' # | CTCF | GM12878 | CTCF.GM12878.data.rds |
#' # | ... | ... | ... |
#'
#' # Assembles training data for the quantitative occupancy model,
#' # uses odd chromosomes for training, keeps at most 50000 candidate sites for
#' # each TF x cell type combination, and splits training data into 10 partitions.
#' assembled_training_data <- assemble_training_data(tf_cell_table,
#' logistic_model = FALSE,
#' chip_col = 'chip',
#' training_chrs = paste0('chr', seq(1,21,2)),
#' n_partitions=10,
#' max_sites = 50000)
#'
#' # Assembles training data for the logistic version of the model
#' assembled_training_data <- assemble_training_data(tf_cell_table,
#' logistic_model = TRUE,
#' chip_col = 'chip_label',
#' training_chrs = paste0('chr', seq(1,21,2)),
#' n_partitions=10,
#' max_sites = 50000)
#'
#' }
#'
assemble_training_data <- function(tf_cell_table,
logistic_model=FALSE,
chip_col='chip',
training_chrs=paste0('chr', seq(1,21,2)),
n_partitions=10,
n_cores=n_partitions,
max_sites=50000,
seed=1){
tf_cell_table <- as.data.frame(tf_cell_table)
if(ncol(tf_cell_table) < 3){
stop('tf_cell_table should have at least three columns! ')
}
colnames(tf_cell_table)[1:3] <- c('tf_name', 'cell_type', 'data_file')
cat('Assemble TOP training data...\n')
tf_list <- sort(unique(as.character(tf_cell_table$tf_name)))
celltype_list <- sort(unique(as.character(tf_cell_table$cell_type)))
cat('TFs:', tf_list, '\n')
cat('Cell types:', celltype_list, '\n')
tf_cell_table$tf_name <- factor(tf_cell_table$tf_name, levels = tf_list)
tf_cell_table$cell_type <- factor(tf_cell_table$cell_type, levels = celltype_list)
tf_cell_table <- tf_cell_table[with(tf_cell_table, order(tf_name, cell_type)),]
# Assemble training data for each partition
if(missing(n_cores)){
n_available_cores <- detectCores(logical = FALSE) - 1
n_cores <- min(n_available_cores, n_partitions)
}else{
n_cores <- min(n_cores, n_partitions)
}
registerDoParallel(cores=n_cores)
all_training_data <- foreach(k=1:n_partitions) %dopar% {
training_data <- assemble_partition_training_data(tf_cell_table,
logistic_model,
chip_col,
training_chrs,
n_partitions,
k,
max_sites/n_partitions,
seed)
# Add TF and cell type indices
training_data$tf_id <- as.integer(factor(training_data$tf_name, levels = tf_list))
training_data$cell_id <- as.integer(factor(training_data$cell_type, levels = celltype_list))
training_data
}
tf_cell_combos <- unique(all_training_data[[1]][, c('tf_id', 'cell_id', 'tf_name', 'cell_type')])
cat(nrow(tf_cell_combos), 'TF x cell type combinations assembled in training data. \n')
return(all_training_data)
}
# Assemble TOP training data for all TF x cell type combinations in a selected partition.
assemble_partition_training_data <- function(tf_cell_table,
logistic_model=FALSE,
chip_col='chip',
training_chrs=paste0('chr', seq(1,21,2)),
n_partitions=10,
part=1,
max_sites=5000,
seed=1) {
if(part > n_partitions || part < 1){
stop('part should be an integer less than n_partitions!')
}
cat('Assemble training data for partition', part, '... \n')
set.seed(seed)
# Load training data for each TF x cell type combo,
# split training data into partitions,
# and select the subset (part) and combine all TF x cell type combinations
assembled_trainng_data <- list()
colnames(tf_cell_table)[1:3] <- c('tf_name', 'cell_type', 'data_file')
for(i in 1:nrow(tf_cell_table)) {
tf_name <- as.character(tf_cell_table$tf_name[i])
cell_type <- as.character(tf_cell_table$cell_type[i])
data_file <- as.character(tf_cell_table$data_file[i])
if(!file.exists(data_file)){
if(part == 1){
message(sprintf('Warning: data of %s in %s cell is not available! Skipped.', tf_name, cell_type))
cat('Check data file:', data_file, '\n')
}
next
}
if(grepl('\\.rds$', data_file, ignore.case = TRUE)){
data <- as.data.frame(readRDS(data_file))
}else{
data <- as.data.frame(data.table::fread(data_file))
}
if(!chip_col %in% colnames(data)){
message(sprintf('Warning: data of %s in %s cell does not have %s column! Skipped.',
tf_name, cell_type, chip_col))
next
}else if(anyNA(data[,chip_col])){
message(sprintf('Warning: data of %s in %s cell contains missing values in %s column! Skipped.',
tf_name, cell_type, chip_col))
next
}
if(!'pwm' %in% colnames(data)){
pwm_col <- grep('pwm', colnames(data), ignore.case = TRUE)
if(length(pwm_col)==1){
colnames(data)[pwm_col] <- 'pwm'
}else{
stop(sprintf('No column of PWM scores found in data file %s \n', data_file))
}
}
if(length(grep('bin', colnames(data))) == 0){
bin_cols <- grep('dnase|atac', colnames(data), ignore.case = TRUE)
if(length(bin_cols) > 0){
colnames(data)[bin_cols] <- paste0('bin', 1:length(bin_cols))
}else{
stop(sprintf('No columns of DNase or ATAC bins found in data file %s \n', data_file))
}
}
if(logistic_model){
data <- data.frame(data[, -grep('chip', colnames(data))], chip_label = data[, chip_col])
}else{
data <- data.frame(data[, -grep('chip', colnames(data))], chip = data[, chip_col])
}
# Select the training set
training_data <- data[data$chr %in% training_chrs,]
partition_size <- ceiling(nrow(training_data) / n_partitions)
partition_groups <- as.factor(ceiling(c(1:nrow(training_data))/partition_size))
data_partitions <- split(training_data, partition_groups)
training_data_partition <- data.frame(tf_name = tf_name,
cell_type = cell_type,
data_partitions[[part]])
# Keep a smaller subset of candidate sites in training data
if(nrow(training_data_partition) > max_sites){
rows <- sample(1:nrow(training_data_partition), max_sites)
training_data_partition <- training_data_partition[rows, ]
}
assembled_trainng_data[[ paste(tf_name, cell_type, sep = '.') ]] <- training_data_partition
}
## row combine all data
assembled_trainng_data <- do.call(rbind, assembled_trainng_data)
row.names(assembled_trainng_data) <- NULL
return(assembled_trainng_data)
}