R/assemble_training_data.R


#' @title Assembles TOP training data for all TF x cell type combinations,
#' then split training data into 10 partitions
#' @description Prepares the training data for fitting TOP models.
#' It splits training data into 10 partitions and
#' assembles training data for all TF x cell type combinations
#' for each of the partitions.
#'
#' @param tf_cell_table A data frame listing all TF x cell type combinations
#' and the training data for each combination.
#' It should have at least three columns, with:
#' TF names, cell types, and file names of the individual training data for
#' each TF x cell type combination. The individual training data
#' should be in .rds or text (.txt, or .csv) format.
#' @param logistic_model Logical. If \code{logistic_model = TRUE},
#' prepare assembled data for the logistic version of TOP model.
#' If \code{logistic_model = FALSE}, prepare assembled data for the
#' quantitative occupancy model (default).
#' @param chip_col The column name of ChIP data in the individual training data
#' (default: \sQuote{chip}).
#' @param training_chrs Chromosomes used for training the model
#' (default: odd chromosomes, chr1, chr3, ..., chr21)
#' @param n_partitions Number of partitions to split the training data (default: 10).
#' @param n_cores Number of cores to run in parallel
#' (default: equal to \code{n_partitions}).
#' @param max_sites Max number of candidate sites to keep for
#' each TF x cell type combination (default: 50000). To reduce computation time,
#' randomly select \code{max_sites} candidate sites for
#' each TF x cell type combination, if the number of candidate sites
#' exceeds \code{max_sites}.
#' @param seed A number for the seed used when sampling sites.
#' @return A list of data frames (default: 10),
#' each containing one partition of the
#' training data with all TF x cell type combinations.
#' @import doParallel
#' @import foreach
#' @importFrom parallel detectCores
#'
#' @export
#' @examples
#' \dontrun{
#'
#' #  tf_cell_table should have three columns with:
#' #  TF names, cell types, and paths to the training data files, like:
#' #  |   tf_name    |   cell_type   |        data_file         |
#' #  |:------------:|:-------------:|:------------------------:|
#' #  |     CTCF     |     K562      |   CTCF.K562.data.rds     |
#' #  |     CTCF     |     A549      |   CTCF.A549.data.rds     |
#' #  |     CTCF     |    GM12878    |   CTCF.GM12878.data.rds  |
#' #  |     ...      |     ...       |   ...                    |
#'
#' # Assembles training data for the quantitative occupancy model,
#' # uses odd chromosomes for training, keeps at most 50000 candidate sites for
#' # each TF x cell type combination, and splits training data into 10 partitions.
#' assembled_training_data <- assemble_training_data(tf_cell_table,
#'                                                   logistic_model = FALSE,
#'                                                   chip_col = 'chip',
#'                                                   training_chrs = paste0('chr', seq(1,21,2)),
#'                                                   n_partitions=10,
#'                                                   max_sites = 50000)
#'
#' # Assembles training data for the logistic version of the model
#' assembled_training_data <- assemble_training_data(tf_cell_table,
#'                                                   logistic_model = TRUE,
#'                                                   chip_col = 'chip_label',
#'                                                   training_chrs = paste0('chr', seq(1,21,2)),
#'                                                   n_partitions=10,
#'                                                   max_sites = 50000)
#'
#' }
#'
assemble_training_data <- function(tf_cell_table,
                                   logistic_model=FALSE,
                                   chip_col='chip',
                                   training_chrs=paste0('chr', seq(1,21,2)),
                                   n_partitions=10,
                                   n_cores=n_partitions,
                                   max_sites=50000,
                                   seed=1){

  tf_cell_table <- as.data.frame(tf_cell_table)

  if(ncol(tf_cell_table) < 3){
    stop('tf_cell_table should have at least three columns! ')
  }
  colnames(tf_cell_table)[1:3] <- c('tf_name', 'cell_type', 'data_file')

  cat('Assemble TOP training data...\n')
  tf_list <- sort(unique(as.character(tf_cell_table$tf_name)))
  celltype_list <- sort(unique(as.character(tf_cell_table$cell_type)))
  cat('TFs:', tf_list, '\n')
  cat('Cell types:', celltype_list, '\n')

  tf_cell_table$tf_name <- factor(tf_cell_table$tf_name, levels = tf_list)
  tf_cell_table$cell_type <- factor(tf_cell_table$cell_type, levels = celltype_list)
  tf_cell_table <- tf_cell_table[with(tf_cell_table, order(tf_name, cell_type)),]

  # Assemble training data for each partition
  if(missing(n_cores)){
    n_available_cores <- detectCores(logical = FALSE) - 1
    n_cores <- min(n_available_cores, n_partitions)
  }else{
    n_cores <- min(n_cores, n_partitions)
  }

  registerDoParallel(cores=n_cores)

  all_training_data <- foreach(k=1:n_partitions) %dopar% {
    training_data <- assemble_partition_training_data(tf_cell_table,
                                                      logistic_model,
                                                      chip_col,
                                                      training_chrs,
                                                      n_partitions,
                                                      k,
                                                      max_sites/n_partitions,
                                                      seed)
    # Add TF and cell type indices
    training_data$tf_id <- as.integer(factor(training_data$tf_name, levels = tf_list))
    training_data$cell_id <- as.integer(factor(training_data$cell_type, levels = celltype_list))

    training_data
  }

  tf_cell_combos <- unique(all_training_data[[1]][, c('tf_id', 'cell_id', 'tf_name', 'cell_type')])
  cat(nrow(tf_cell_combos), 'TF x cell type combinations assembled in training data. \n')

  return(all_training_data)
}


# Assemble TOP training data for all TF x cell type combinations in a selected partition.
assemble_partition_training_data <- function(tf_cell_table,
                                             logistic_model=FALSE,
                                             chip_col='chip',
                                             training_chrs=paste0('chr', seq(1,21,2)),
                                             n_partitions=10,
                                             part=1,
                                             max_sites=5000,
                                             seed=1) {

  if(part > n_partitions || part < 1){
    stop('part should be an integer less than n_partitions!')
  }
  cat('Assemble training data for partition', part, '... \n')

  set.seed(seed)

  # Load training data for each TF x cell type combo,
  # split training data into partitions,
  # and select the subset (part) and combine all TF x cell type combinations
  assembled_trainng_data <- list()

  colnames(tf_cell_table)[1:3] <- c('tf_name', 'cell_type', 'data_file')

  for(i in 1:nrow(tf_cell_table)) {

    tf_name <- as.character(tf_cell_table$tf_name[i])
    cell_type <- as.character(tf_cell_table$cell_type[i])
    data_file <- as.character(tf_cell_table$data_file[i])

    if(!file.exists(data_file)){
      if(part == 1){
        message(sprintf('Warning: data of %s in %s cell is not available! Skipped.', tf_name, cell_type))
        cat('Check data file:', data_file, '\n')
      }
      next
    }

    if(grepl('\\.rds$', data_file, ignore.case = TRUE)){
      data <- as.data.frame(readRDS(data_file))
    }else{
      data <- as.data.frame(data.table::fread(data_file))
    }

    if(!chip_col %in% colnames(data)){
      message(sprintf('Warning: data of %s in %s cell does not have %s column! Skipped.',
                      tf_name, cell_type, chip_col))
      next
    }else if(anyNA(data[,chip_col])){
      message(sprintf('Warning: data of %s in %s cell contains missing values in %s column! Skipped.',
                      tf_name, cell_type, chip_col))
      next
    }

    if(!'pwm' %in% colnames(data)){
      pwm_col <- grep('pwm', colnames(data), ignore.case = TRUE)
      if(length(pwm_col)==1){
        colnames(data)[pwm_col] <- 'pwm'
      }else{
        stop(sprintf('No column of PWM scores found in data file %s \n', data_file))
      }
    }

    if(length(grep('bin', colnames(data))) == 0){
      bin_cols <- grep('dnase|atac', colnames(data), ignore.case = TRUE)
      if(length(bin_cols) > 0){
        colnames(data)[bin_cols] <- paste0('bin', 1:length(bin_cols))
      }else{
        stop(sprintf('No columns of DNase or ATAC bins found in data file %s \n', data_file))
      }
    }

    if(logistic_model){
      data <- data.frame(data[, -grep('chip', colnames(data))], chip_label = data[, chip_col])
    }else{
      data <- data.frame(data[, -grep('chip', colnames(data))], chip = data[, chip_col])
    }

    # Select the training set
    training_data <- data[data$chr %in% training_chrs,]
    partition_size <- ceiling(nrow(training_data) / n_partitions)
    partition_groups <- as.factor(ceiling(c(1:nrow(training_data))/partition_size))
    data_partitions <- split(training_data, partition_groups)
    training_data_partition <- data.frame(tf_name = tf_name,
                                          cell_type = cell_type,
                                          data_partitions[[part]])

    # Keep a smaller subset of candidate sites in training data
    if(nrow(training_data_partition) > max_sites){
      rows <- sample(1:nrow(training_data_partition), max_sites)
      training_data_partition <- training_data_partition[rows, ]
    }
    assembled_trainng_data[[ paste(tf_name, cell_type, sep = '.') ]] <- training_data_partition

  }

  ## row combine all data
  assembled_trainng_data <- do.call(rbind, assembled_trainng_data)
  row.names(assembled_trainng_data) <- NULL

  return(assembled_trainng_data)

}