In [1]:
###################################################################################################
###                             Motif Enrichment                                ###
###################################################################################################

#load libraries
library(tidyverse)
library(devtools)
library(motifmatchr)
library(BiocParallel)
load_all('/home/jpm73279/genome_downloads/BS_genomes/BSgenome.Zm_B73')
library(Matrix)
library(GenomicAlignments)
library(dplyr)
library(universalmotif)  # manipulating motif representations



── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2          [32m✔[39m [34mreadr    [39m 2.1.4     
[32m✔[39m [34mforcats  [39m 1.0.0.[31m9000[39m     [32m✔[39m [34mstringr  [39m 1.5.1     
[32m✔[39m [34mggplot2  [39m 3.4.4          [32m✔[39m [34mtibble   [39m 3.2.1     
[32m✔[39m [34mlubridate[39m 1.9.2          [32m✔[39m [34mtidyr    [39m 1.3.0     
[32m✔[39m [34mpurrr    [39m 1.0.2          
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
“package ‘devtools’ was built under R version 4.3.1”
Loading required package: usethis



[1m[22m[36mℹ[39m Loa

In [2]:
### Load Motifs for Analysis
# Three MEME-format motif files are read with read_meme(): a conserved
# consensus set, a combined de novo set, and an Ms-specific CGTCGT motif
# (descriptions taken from the file names).
core_motifs <- read_meme("/scratch/jpm73279/comparative_single_cell/07.call.ACRs/00.data/conserved_consensus_motifs_Fig2B.meme")
de_novo_motifs <- read_meme("/home/jpm73279/genome_downloads/C4_markers/found_motifs/combined_unique_motifs.meme")
ms_de_novo_motifs <- read_meme("/home/jpm73279/genome_downloads/C4_markers/found_motifs/Ms.specific_CGTCGT_motif.comparative.meme")

# Concatenate the three motif collections into one collection.
all_assy_motifs <- c(core_motifs, de_novo_motifs, ms_de_novo_motifs)




# Convert every motif to the TFBSTools PWMatrix class; the result is later
# assembled into the PWMatrixList passed to matchMotifs().
all_assy_motifs_converted <- convert_motifs(all_assy_motifs, class = "TFBSTools-PWMatrix")

# Copy a motif object's `name` slot into its `ID` slot.
#
# pwm: an S4 object exposing `name` and `ID` slots.
# Returns the object with ID replaced by name; the object is returned
# untouched when the name slot is NULL.
update_name <- function(pwm) {
    motif_name <- pwm@name
    if (is.null(motif_name)) {
        return(pwm)
    }
    pwm@ID <- motif_name
    pwm
}

library(TFBSTools)
# Step 1: make each motif's ID equal to its name (see update_name()).
pw_matrix_list_converted <- lapply(all_assy_motifs_converted, update_name)
# Step 2: wrap every motif in its own PWMatrixList ...
pw_matrix_list_converted <- lapply(pw_matrix_list_converted, PWMatrixList)
# Step 3: ... and concatenate those lists with c() into a single object
# (presumably one PWMatrixList holding all motifs).
pw_matrix_list_converted <- do.call(c, pw_matrix_list_converted)


Attaching package: ‘TFBSTools’


The following object is masked from ‘package:Matrix’:

    Matrix




In [3]:
# Marker BED files to load; process_file() takes the species code from the
# first two characters of each file's base name.
marker_file_paths <- c("/home/jpm73279/genome_downloads/C4_markers/Zm.c4_markers.bed")

# Read one marker file and tag its rows with a species code.
#
# file_path: path to a tab-delimited, header-less file with six columns
#            (chrom, start, end, geneID, name, type).
# Returns the table with an extra `species` column holding the first two
# characters of the file's base name.
process_file <- function(file_path) {
  species_code <- substr(basename(file_path), 1, 2)

  marker_table <- read_delim(
    file_path,
    delim = "\t",
    col_names = c("chrom", "start", "end", "geneID", "name", "type")
  )
  marker_table <- dplyr::select(marker_table, "chrom", "start", "end", "geneID", "name", "type")
  dplyr::mutate(marker_table, species = species_code)
}

# Read every marker file; the result is one table per path.
list_of_dataframes <- lapply(marker_file_paths, process_file)

# Stack the per-file tables and keep only the gene-level annotation columns
# (coordinates are dropped); `markers` is the lookup table joined onto ACRs
# in process_acr_data().
markers <- bind_rows(list_of_dataframes) %>% 
    dplyr::select(geneID,name,type,species)

[1mRows: [22m[34m33[39m [1mColumns: [22m[34m6[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (4): chrom, geneID, name, type
[32mdbl[39m (2): start, end

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [4]:
# Load an ACR/gene intersection table and return the ACRs belonging to the
# requested cell-type classes as a GRanges object.
#
# Arguments:
#   file_path         tab-delimited, header-less file with ten columns, all
#                     read as character (X1..X10)
#   species_abbr      species code written to the `species` column; it is
#                     also one of the join keys
#   cell_type_filter  character vector of ACR classes to keep
#   markers           marker table joined on `species` and on X5 == name
#   genome            object whose seqinfo() is attached to the result
#
# Returns: GRanges built from the filtered table. File columns X6/X7/X8
# become chr/start/end, and the ACR class is stored in the `score` column.
process_acr_data <- function(file_path, species_abbr, cell_type_filter, markers, genome) {
    raw_acrs <- readr::read_delim(
        file_path,
        delim = "\t",
        col_names = FALSE,
        col_types = "cccccccccc"
    )
    raw_acrs <- dplyr::mutate(raw_acrs, species = species_abbr)

    # Attach marker annotation, split the "acr;class" field, normalise the
    # class labels and break the locus name into its components.
    annotated_acrs <- raw_acrs %>%
        dplyr::ungroup() %>%
        dplyr::left_join(markers, by = c("species", "X5" = "name")) %>%
        dplyr::rename(
            "enzyme_location" = type,
            "locus" = X5,
            "acr_name_type" = X9
        ) %>%
        tidyr::separate(
            acr_name_type,
            into = c("acr", "acr_cell_type_specific_class"),
            sep = ";"
        ) %>%
        dplyr::mutate(acr_cell_type_specific_class = dplyr::case_when(
            acr_cell_type_specific_class %in% c(
                "bundle_sheath,procambial_meristem",
                "bundle_sheath,procambium"
            ) ~ "bundle_sheath",
            acr_cell_type_specific_class == "mesophyll,bundle_sheath" ~ "mesophyll_bundle_sheath",
            TRUE ~ acr_cell_type_specific_class
        )) %>%
        tidyr::separate(locus, into = c("species_other", "loci"), sep = "__", remove = FALSE) %>%
        tidyr::separate(loci, into = c("gene_family", "number"), sep = "_", remove = FALSE)

    # ACRs shared by mesophyll and bundle sheath are emitted twice, once under
    # each single-cell-type label, after the rows that already had one label.
    shared_acrs <- dplyr::filter(
        annotated_acrs,
        acr_cell_type_specific_class == "mesophyll_bundle_sheath"
    )
    single_class_acrs <- dplyr::filter(
        annotated_acrs,
        acr_cell_type_specific_class != "mesophyll_bundle_sheath"
    )
    expanded_acrs <- dplyr::bind_rows(
        single_class_acrs,
        dplyr::mutate(shared_acrs, acr_cell_type_specific_class = "mesophyll"),
        dplyr::mutate(shared_acrs, acr_cell_type_specific_class = "bundle_sheath")
    )

    # Keep rows that matched a marker locus and belong to a requested class,
    # then rename to the column names GRanges() recognises.
    selected_acrs <- expanded_acrs %>%
        dplyr::filter(
            !is.na(locus),
            acr_cell_type_specific_class %in% cell_type_filter
        ) %>%
        dplyr::rename(
            chr = X6,
            start = X7,
            end = X8,
            acr_id = acr,
            score = acr_cell_type_specific_class
        )

    GRanges(selected_acrs, seqinfo = seqinfo(genome))
}



# file_path <- "/scratch/jpm73279/comparative_single_cell/07.call.ACRs/10.C4_gene_relationship_only_single_gene/all_acr_intersections/zm.extended_c4_genes.acr_intersection.all.bed"  # Replace with your file path
# species_abbr <- "Zm"                   # Replace with your species abbreviation
# cell_type_filter <- c("broadly_accessible")  # Replace with your cell type filter

# zm_acrs_cts_gr <- process_acr_data(file_path, species_abbr, cell_type_filter, markers, BSgenome.Zm_B73)


In [5]:
# Read an ACR conservation intersection table and return the conserved
# reference intervals of selected ACRs as a GRanges object.
#
# Arguments:
#   file_path       header-less delimited file with twelve columns, all read
#                   as character (see `col_names` below)
#   species         species code stored in the `species` column and used as
#                   the prefix of `qacr`
#   filter_acr_ids  character vector of ACR ids; only rows whose `acr` is in
#                   this set are kept
#   genome          object whose seqinfo() is attached to the result
#
# Returns: GRanges whose coordinates are rChr/rStart/rEnd, with `acr_id`
# (the query ACR) and `score` (the rWidth column) among the metadata.
process_and_convert_conservation_data <- function(file_path, species, filter_acr_ids, genome) {
    conservation_data <- readr::read_delim(
        file_path,
        col_names = c("qChr", "qStart", "qEnd", "acr_name", "pval", "refFile",
                      "rChr", "rStart", "rEnd", "rName", "rWidth", "strand"),
        col_types = "cccccccccccc"
    ) %>%
        dplyr::mutate(species = species) %>%
        tidyr::separate(acr_name, into = c("acr", "acr_cell_type_specific_class", "locus"), sep = ";") %>%
        dplyr::mutate(
            # Strip the fixed file suffix, then everything up to and including ".vs."
            refFile = stringr::str_replace(refFile, "\\.all_combined_regions\\.passing_regions\\.blast_passing_regions\\.intersecting_regions\\.ref\\.frac\\.bed", ""),
            refFile = stringr::str_replace(refFile, ".*\\.vs\\.", "")
        ) %>%
        # "." marks rows without an intersecting reference region
        dplyr::filter(rName != ".") %>%
        tidyr::separate(rName, into = c("RefFrom", "racr_name", "rcell_type", "gene_family", "gene_family2"), sep = ";") %>%
        dplyr::select(acr, acr_cell_type_specific_class, locus, racr_name, rcell_type, gene_family, rWidth, species, rChr, rStart, rEnd) %>%
        # Split "<species>_<acr>" at the FIRST underscore only. The previous
        # separator "_(.*)" matched the whole remainder of the string, so
        # `racr` always came out empty; with "_" plus extra = "merge" the
        # remainder is kept intact in `racr`.
        tidyr::separate(racr_name, into = c("rspecies", "racr"), sep = "_", extra = "merge", remove = FALSE) %>%
        dplyr::mutate(acr_cell_type_specific_class = dplyr::case_when(
            acr_cell_type_specific_class == "bundle_sheath,procambial_meristem" ~ "bundle_sheath",
            acr_cell_type_specific_class == "bundle_sheath,procambium" ~ "bundle_sheath",
            TRUE ~ acr_cell_type_specific_class
        )) %>%
        dplyr::mutate(qacr = stringr::str_c(species, acr, sep = "_"))

    # Rename to the column names GRanges() recognises and keep only the ACRs
    # selected by the caller.
    conservation_table <- conservation_data %>%
        dplyr::rename(
            chr = rChr,
            start = rStart,
            end = rEnd,
            acr_id = acr,
            score = rWidth
        ) %>%
        dplyr::filter(acr_id %in% filter_acr_ids)

    GRanges(conservation_table, seqinfo = seqinfo(genome))
}

# conserved_regions_granges <- process_and_convert_conservation_data("/home/jpm73279/genome_downloads/C4_markers/conservation/Zm.all.ACR_conservation.intersection.bed",
#                                      "Zm",
#                                       filter_acrs_cell_type_specifc$acr,
#                                     BSgenome.Zm_B73)

In [15]:
library(GenomicRanges)

# Partition reference ACRs into conserved and non-conserved segments.
#
# Arguments:
#   conserved_gr  GRanges of conserved intervals with an `acr_id` metadata
#                 column
#   reference_gr  GRanges of reference ACRs whose metadata is copied onto
#                 every output segment
#
# Returns: GRanges holding (1) one conserved segment per overlapping
# (conserved interval, reference ACR) pair, flagged
# conserved_regions == "conserved", followed by (2) the parts of the reference
# ACRs not covered by those segments, flagged "non_conserved". Every row
# carries the metadata of a reference ACR it overlaps and a running
# `unique_id`.
process_and_merge_granges <- function(conserved_gr, reference_gr) {
    # Merge overlapping conserved intervals separately within each ACR so that
    # intervals belonging to different ACRs are never fused together.
    per_acr_ranges <- lapply(split(conserved_gr, conserved_gr$acr_id), GenomicRanges::reduce)

    conserved_gr_merged <- GRanges()
    for (acr_name in names(per_acr_ranges)) {
        acr_ranges <- per_acr_ranges[[acr_name]]
        mcols(acr_ranges)$acr_ID <- acr_name
        conserved_gr_merged <- c(conserved_gr_merged, acr_ranges)
    }

    # Annotate the conserved regions.
    # queryHits index `conserved_gr_merged`; subjectHits index `reference_gr`.
    # The earlier code used subjectHits to subset the conserved ranges, which
    # picked unrelated (or out-of-range) intervals.
    hits <- GenomicRanges::findOverlaps(conserved_gr_merged, reference_gr)
    if (length(hits) > 0) {
        conserved_idx <- queryHits(hits)
        reference_idx <- subjectHits(hits)

        conserved_gr_merged <- conserved_gr_merged[conserved_idx]
        mcols(conserved_gr_merged) <- mcols(reference_gr)[reference_idx, , drop = FALSE]
        conserved_gr_merged$conserved_regions <- "conserved"
    } else {
        conserved_gr_merged <- GRanges() # Empty GRanges if no valid overlaps
    }

    # Annotate the non-conserved regulatory regions: whatever is left of the
    # reference ACRs after removing the conserved segments.
    results <- GenomicRanges::setdiff(reference_gr, conserved_gr_merged, ignore.strand = TRUE)
    if (length(results) > 0) {
        revmap <- GenomicRanges::findOverlaps(results, reference_gr, select = "arbitrary")
        mcols(results) <- mcols(reference_gr)[revmap, , drop = FALSE]
        results$conserved_regions <- "non_conserved"
    }

    # Assign unique IDs to all regions
    all_regions <- c(conserved_gr_merged, results)
    all_regions$unique_id <- seq_along(all_regions)

    all_regions
}


In [71]:
# Scan regions for motif matches and copy region metadata onto each match.
#
# Arguments:
#   pwm_matrix_list  motif collection with a `listData` slot whose elements
#                    have an `ID` slot
#   merged_regions   GRanges of regions to scan; its mcols are copied onto hits
#   genome           genome object handed to matchMotifs()
#   p_cutoff         p-value cutoff handed to matchMotifs()
#
# Returns: `matches_grl`, the per-motif match positions after annotation.
# NOTE(review): `final_regions` (matches plus "No Motif" regions) is built at
# the end but never returned -- confirm which value callers are meant to get.
match_and_annotate_motifs <- function(pwm_matrix_list, merged_regions, genome, p_cutoff) {
    
    # Build a (group index, TF_ID) lookup table from the motif IDs.
    extractIDDataFrame <- function(pwMatrixList) {
          ids <- sapply(pwMatrixList@listData, function(item) item@ID)
          # Removes every literal "chr" substring from the IDs
          ids <- gsub("chr", "", ids, fixed = TRUE)
          indices <- seq_along(ids)
  
          data.frame(group = indices, TF_ID = ids)
    }
    motif_id_index <- extractIDDataFrame(pwm_matrix_list)

                
    # Matching motifs
    matches_grl <- matchMotifs(
        pwm_matrix_list, 
        merged_regions, 
        genome = genome, 
        p.cutoff = p_cutoff, 
        out = "positions"
    )

    # Annotating each GRanges object in the list with the corresponding TF ID
 motif_IDs <- motif_id_index$TF_ID
    for (i in seq_along(matches_grl)) {
        overlaps <- GenomicRanges::findOverlaps(matches_grl[[i]], merged_regions)
        overlap_indices <- subjectHits(overlaps)
        overlap_indices_query <- queryHits(overlaps)

        # Same number of hits as matches: copy region metadata in hit order
        if (length(overlap_indices) == length(matches_grl[[i]])) {
            mcols(matches_grl[[i]]) <- mcols(merged_regions)[overlap_indices, ]
        } else {
            # Subset matches_grl[[i]] to include only overlapping ranges
            # (a match that hits several regions appears once per hit)
            if (length(overlap_indices_query) > 0) {
                subset_matches <- matches_grl[[i]][overlap_indices_query]

                # Transfer metadata columns to the subset
                mcols(subset_matches) <- mcols(merged_regions)[overlap_indices, ]

                # Update matches_grl[[i]] with the subset
                matches_grl[[i]] <- subset_matches
            } else {
                # If no elements to subset, set matches_grl[[i]] to a GRanges object with no ranges
                matches_grl[[i]] <- GRanges()
            }
        }

        # Assign TF_ID only if matches_grl[[i]] is not empty
        if (length(matches_grl[[i]]) > 0) {
            matches_grl[[i]]$TF_ID <- motif_IDs[i]
        }
    }
                        
                        
                        
    # Combine all matched regions into a single GRanges object
    matched_regions <- do.call(c, matches_grl)

    # Find and annotate regions without motifs
    non_matched_regions <- setdiff(merged_regions, matched_regions)

    # Transfer metadata from merged_regions to non_matched_regions
    overlap_non_matched <- findOverlaps(non_matched_regions, merged_regions)

    # Create a new GRanges object for non_matched_regions with correct metadata
    if (length(overlap_non_matched) > 0) {
        # Extract the metadata for the overlapping regions
        non_matched_metadata <- mcols(merged_regions)[subjectHits(overlap_non_matched), ]

        # Create a new GRanges object with the correct metadata
        non_matched_regions_with_metadata <- non_matched_regions[queryHits(overlap_non_matched)]
        mcols(non_matched_regions_with_metadata) <- non_matched_metadata

        # Annotate TF_ID
        non_matched_regions_with_metadata$TF_ID <- "No Motif"

        # Combine the non-matched regions with and without metadata
        non_matched_regions <- c(non_matched_regions_with_metadata, non_matched_regions[setdiff(seq_along(non_matched_regions), queryHits(overlap_non_matched))])
        non_matched_regions$TF_ID[is.na(non_matched_regions$TF_ID)] <- "No Motif"
    } else {
        # If no overlaps, just annotate TF_ID
        non_matched_regions$TF_ID <- "No Motif"
    }
                        
                        
    # NOTE(review): computed but unused -- the function returns matches_grl below
    final_regions <- c(matched_regions, non_matched_regions)
                        
                        
    return(matches_grl)
}


In [108]:

library(plyranges)
# Flatten a result object to a data frame and label every row with its ACR
# class.
#
# granges_list_object: any object accepted by as.data.frame()
# class_acrs:          label stored in the new `acr_class_type` column
# Returns the labelled data frame.
prepare_ranges_for_analysis <- function(granges_list_object, class_acrs) {
    flat_table <- as.data.frame(granges_list_object)
    labelled_table <- mutate(flat_table, acr_class_type = class_acrs)
    labelled_table
}

In [9]:
# Fix the random number generator seed for the remainder of the session.
set.seed(1234)

In [10]:
# Shared inputs for the three cell-type runs (broadly accessible, mesophyll,
# bundle sheath): the ACR/gene intersection file, the conservation file, the
# species code, one class filter per run, and the genome object.
acc_acrs_file <- "/scratch/jpm73279/comparative_single_cell/07.call.ACRs/10.C4_gene_relationship_only_single_gene/all_acr_intersections/zm.extended_c4_genes.acr_intersection.all.bed"  # Replace with your file path
conserved_acr_file <- "/home/jpm73279/genome_downloads/C4_markers/conservation/Zm.all.ACR_conservation.intersection.bed"
species_abbr <- "Zm"                   # Replace with your species abbreviation
broadly_acc <- c("broadly_accessible")  # Replace with your cell type filter
ms_acc <- c("mesophyll")  # Replace with your cell type filter
bs_acc <- c("bundle_sheath")  # Replace with your cell type filter
genome <- BSgenome.Zm_B73 

# Broadly accessible ACRs as a GRanges object.
broad_acc <- process_acr_data(acc_acrs_file, 
                                   species_abbr, 
                                   broadly_acc, 
                                   markers,
                                    genome)

In [11]:
# Conserved intervals restricted to the broadly accessible ACR ids.
broad_acrs_conserved_regions_granges <- process_and_convert_conservation_data(conserved_acr_file,
                                     "Zm",
                                      broad_acc$acr_id,
                                      genome)

In [16]:
# Split the broadly accessible ACRs into conserved / non-conserved segments.
broad_acc_merged_regions <- process_and_merge_granges(broad_acrs_conserved_regions_granges, broad_acc)

In [95]:
library(GenomicRanges)
library(dplyr)

# Count motif matches per region and return them alongside region metadata.
#
# Arguments:
#   pwm_matrix_list  motif collection with a `listData` slot whose elements
#                    have an `ID` slot
#   merged_regions   GRanges of regions to scan
#   genome           genome object handed to matchMotifs()
#   p_cutoff         p-value cutoff handed to matchMotifs()
#
# Returns: a data frame with one row per region (row names "Region_<i>"):
# the metadata columns of `merged_regions` followed by one integer column per
# motif, named by motif ID, holding the number of matches of that motif that
# overlap the region. Zero regions yield a zero-row data frame.
match_and_annotate_motifs_with_metadata <- function(pwm_matrix_list, merged_regions, genome, p_cutoff) {
    # Motif IDs in list order; every literal "chr" substring is removed.
    motif_IDs <- vapply(pwm_matrix_list@listData, function(item) item@ID, character(1))
    motif_IDs <- gsub("chr", "", unname(motif_IDs), fixed = TRUE)

    # Matching motifs: one GRanges of match positions per motif
    matches_grl <- matchMotifs(
        pwm_matrix_list,
        merged_regions,
        genome = genome,
        p.cutoff = p_cutoff,
        out = "positions"
    )

    # One vectorised overlap count per motif replaces a findOverlaps() call
    # for every (region, motif) pair; each entry is the number of that motif's
    # matches overlapping the region.
    n_regions <- length(merged_regions)
    counts_by_motif <- lapply(seq_along(matches_grl), function(i) {
        GenomicRanges::countOverlaps(merged_regions, matches_grl[[i]])
    })

    # paste0() on a zero-length index would still yield "Region_", so guard it
    region_labels <- if (n_regions > 0) {
        paste0("Region_", seq_len(n_regions))
    } else {
        character(0)
    }

    # regions x motifs matrix, filled column by column (one column per motif)
    motif_count_matrix <- matrix(
        as.integer(unlist(counts_by_motif, use.names = FALSE)),
        nrow = n_regions,
        ncol = length(counts_by_motif),
        dimnames = list(region_labels, motif_IDs)
    )

    # Extract metadata from merged_regions and combine it with the counts
    metadata_df <- as.data.frame(mcols(merged_regions))
    cbind(metadata_df, motif_count_matrix)
}


In [97]:
# Per-region motif counts for the broadly accessible ACRs (p-value cutoff 5e-4).
broad_acc_motif_counts <- match_and_annotate_motifs_with_metadata(pw_matrix_list_converted, 
                                             broad_acc_merged_regions, 
                                             genome,
                                             .0005)

In [103]:

# Mesophyll run: load the ACRs of that class, fetch their conserved
# intervals, split into conserved / non-conserved segments, then count motifs.
mesophyll_acc <- process_acr_data(acc_acrs_file, 
                                   species_abbr, 
                                   ms_acc, 
                                   markers,
                                 genome)

mesophyll_acrs_conserved_regions_granges <- process_and_convert_conservation_data(conserved_acr_file,
                                     "Zm",
                                      mesophyll_acc$acr_id,
                                      genome)

mesophyll_acc_merged_regions <- process_and_merge_granges(mesophyll_acrs_conserved_regions_granges, mesophyll_acc)

# Despite the "_locations" name this holds the per-region count table
# returned by match_and_annotate_motifs_with_metadata().
mesophyll_acc_motif_locations <- match_and_annotate_motifs_with_metadata(pw_matrix_list_converted, 
                                             mesophyll_acc_merged_regions, 
                                             genome,
                                             .0005)         


In [104]:
# Bundle sheath run: load the ACRs of that class, fetch their conserved
# intervals, split into conserved / non-conserved segments, then count motifs.
bundle_sheath_acc <- process_acr_data(acc_acrs_file,
                                      species_abbr,
                                      bs_acc,
                                      markers,
                                      genome)

bundle_sheath_acrs_conserved_regions_granges <- process_and_convert_conservation_data(conserved_acr_file,
                                      "Zm",
                                      bundle_sheath_acc$acr_id,
                                      genome)

bundle_sheath_acc_merged_regions <- process_and_merge_granges(bundle_sheath_acrs_conserved_regions_granges, bundle_sheath_acc)

# Uses the shared `genome` variable (assigned from BSgenome.Zm_B73 earlier in
# the script) like the other runs, instead of naming the genome directly.
# Despite the "_locations" name this holds the per-region count table.
bundle_sheath_acc_motif_locations <- match_and_annotate_motifs_with_metadata(pw_matrix_list_converted,
                                             bundle_sheath_acc_merged_regions,
                                             genome,
                                             .0005)



In [109]:

# Label each per-region motif count table with its ACR class and stack them.
# The broadly accessible table is assigned to `broad_acc_motif_counts` earlier
# in the script; the name previously used here (`broad_acc_motif_locations`)
# is never defined, so a top-to-bottom run failed at this line.
broad_acc_final <- prepare_ranges_for_analysis(broad_acc_motif_counts, "broadly_accessible")
meso_acc_final <- prepare_ranges_for_analysis(mesophyll_acc_motif_locations, "mesophyll")
bs_acc_final <- prepare_ranges_for_analysis(bundle_sheath_acc_motif_locations, "bundle_sheath")


combined_vals <- bind_rows(broad_acc_final, meso_acc_final, bs_acc_final)


In [110]:
# Write the combined per-region motif counts for all three ACR classes as a
# tab-separated file.
write_delim(combined_vals, "/home/jpm73279/genome_downloads/C4_markers/count_motifs/Zm.counted_motifs.region_aware.tsv",
           delim = "\t")