default asEnrichdat to enrichment analysis

GangLiLab · Sep 10, 2021 · 0e6cbb8 · 0e6cbb8
1 parent a629428
commit 0e6cbb8
Show file tree

Hide file tree

Showing 10 changed files with 154 additions and 145 deletions.
diff --git a/R/asEnrichdat.R b/R/asEnrichdat.R
@@ -0,0 +1,124 @@
+#' Adjust dataframe for enrichment plot
+#'
+#' Standardise an enrichment result so that its column names contain
+#' Description, Count, FoldEnrich/GeneRatio and pvalue/qvalue/p.adjust.
+#'
+#' @param enrich_df A `data.frame` of enrichment analysis results.
+#'
+#' @importFrom stringr str_remove_all str_split str_remove
+#' @importFrom dplyr mutate pull
+#' @return A `data.frame` with standardised column names; `Count` and
+#'   `GeneRatio` are added when they can be derived from other columns.
+#' @export
+
+as.enrichdat <- function(enrich_df) {
+  stopifnot(is.data.frame(enrich_df))
+  ## normalised names: lower case, without brackets, blanks, "-" and "_"
+  to_check <- gsub("\\(|\\)| |-|_", "", tolower(colnames(enrich_df)))
+  n_col <- ncol(enrich_df)
+  pos <- seq_len(n_col)
+
+  # index of the first flagged column, NA when nothing is flagged
+  first_col <- function(hit) which(hit)[1]
+  # per column: do all values (taken as text) satisfy `keep`?
+  all_values <- function(keep) {
+    vapply(enrich_df, function(x) all(keep(as.character(x))), logical(1))
+  }
+  # first column carrying a size in its name, e.g. "upload_1 (204)"
+  size_pattern <- "\\([0-9]{1,4}\\)"
+  size_col <- function() first_col(grepl(size_pattern, colnames(enrich_df)))
+
+  ## find description col
+  desc_col <- first_col(grepl("description", to_check))
+  if (is.na(desc_col)) {
+    # fall back to the first-half text column with the most distinct values
+    is_text <- all_values(function(x) grepl("[A-Za-z]{3,}", x))
+    n_distinct <- vapply(enrich_df, function(x) length(unique(x)), integer(1))
+    candidates <- which(is_text & pos < n_col / 2)
+    desc_col <- candidates[which.max(n_distinct[candidates])][1]
+    if (is.na(desc_col)) stop("Not found description column!")
+  }
+  colnames(enrich_df)[desc_col] <- "Description"
+
+  ## find count col
+  # if finds genes col, calc gene num as count; else find another col as count
+  count_col <- first_col(grepl("count", to_check))
+  if (!is.na(count_col)) {
+    colnames(enrich_df)[count_col] <- "Count"
+  } else {
+    is_gene <- all_values(function(x) {
+      grepl("[A-Za-z]{3,}|\\/|,", x) & !grepl("tags|list", x)
+    })
+    gene_col <- first_col(is_gene & pos > n_col / 2 & pos != desc_col)
+    if (!is.na(gene_col)) {
+      colnames(enrich_df)[gene_col] <- "geneID"
+      genes <- strsplit(as.character(enrich_df[[gene_col]]), ",|\\/")
+      enrich_df$Count <- lengths(genes)
+    } else if (!is.na(size_col())) {
+      enrich_df$Count <- as.numeric(enrich_df[[size_col()]])
+    } else {
+      stop('Please rename the gene count column as "Count"!')
+    }
+  }
+
+  ## find FoldEnrich col
+  # GSEA result has no FoldEnrich, need to exclude
+  is_gsea <- any(grepl("\\benrichmentscore\\b", to_check)) &&
+    any(grepl("\\bleadingedge\\b", to_check))
+  if (!is_gsea) {
+    # an explicit fold-enrichment column wins over any other "enrichment" one
+    fold_col <- first_col(grepl("foldenrich", to_check))
+    if (is.na(fold_col)) fold_col <- first_col(grepl("enrichment", to_check))
+    if (is.na(fold_col)) stop('Please rename the fold enrichment column as "FoldEnrich"!')
+    colnames(enrich_df)[fold_col] <- "FoldEnrich"
+  }
+
+  ## find GeneRatio col
+  ratio_col <- first_col(grepl("generatio", to_check))
+  set_col <- first_col(grepl("setsize", to_check))
+  if (!is.na(ratio_col)) {
+    colnames(enrich_df)[ratio_col] <- "GeneRatio"
+    # "12/345" -> 12 / 345 without eval(parse()); plain numbers pass through
+    ratio <- strsplit(as.character(enrich_df[[ratio_col]]), "/", fixed = TRUE)
+    enrich_df[[ratio_col]] <- vapply(ratio, function(x) {
+      x <- suppressWarnings(as.numeric(x))
+      if (length(x) == 2L) x[1] / x[2] else if (length(x) == 1L) x else NA_real_
+    }, numeric(1))
+  } else if (!is.na(set_col)) {
+    # gsea
+    colnames(enrich_df)[set_col] <- "setSize"
+    enrich_df$GeneRatio <- as.numeric(enrich_df$Count) / as.numeric(enrich_df[[set_col]])
+  } else if (!is.na(size_col())) {
+    # panther result: the set size is the number in the column name
+    size_name <- colnames(enrich_df)[size_col()]
+    setsize <- regmatches(size_name, regexpr(size_pattern, size_name))
+    setsize <- as.numeric(gsub("[()]", "", setsize))
+    enrich_df$GeneRatio <- as.numeric(enrich_df$Count) / setsize
+  } else if (nrow(enrich_df) > 1) {
+    # heuristic: the smallest positive number held by a column that is
+    # constant over all rows (e.g. a list total) is used as the set size
+    consts <- vapply(enrich_df, function(x) {
+      if (length(unique(x)) != 1L) return(NA_real_)
+      suppressWarnings(as.numeric(as.character(x[1])))
+    }, numeric(1))
+    consts <- consts[!is.na(consts) & consts > 0]
+    if (length(consts) > 0) enrich_df$GeneRatio <- as.numeric(enrich_df$Count) / min(consts)
+  }
+
+  ## find p.adjust col first so it is never taken as the raw p-value
+  adj_col <- first_col(grepl("p.adjust", to_check))
+  if (is.na(adj_col)) adj_col <- first_col(grepl("\\bcorrectedpvalue\\b", to_check))
+  if (!is.na(adj_col)) colnames(enrich_df)[adj_col] <- "p.adjust"
+
+  ## find pvalue col: exact names first, then any other "pvalue" column
+  p_col <- first_col(to_check %in% c("pvalue", "uncorrectedpvalue"))
+  if (is.na(p_col)) p_col <- first_col(grepl("pvalue", to_check) & !pos %in% adj_col)
+  if (!is.na(p_col)) colnames(enrich_df)[p_col] <- "pvalue"
+
+  ## find qvalue col; FDR counts as q-value unless it is an "FDR rate"
+  q_col <- first_col(grepl("qvalue", to_check))
+  if (is.na(q_col)) q_col <- first_col(grepl("fdr", to_check) & !grepl("fdrrate", to_check))
+  if (!is.na(q_col)) colnames(enrich_df)[q_col] <- "qvalue"
+
+  return(enrich_df)
+}
diff --git a/R/genGO.R b/R/genGO.R
@@ -81,11 +81,13 @@ genGO <- function(id,
     new_ego <- ego %>%
       as.data.frame() %>%
       dplyr::mutate(geneID = new_geneID) %>%
-      calcFoldEnrich()
+      calcFoldEnrich() %>%
+      as.enrichdat()
   } else {
     new_ego <- ego %>%
       as.data.frame() %>%
-      calcFoldEnrich()
+      calcFoldEnrich() %>%
+      as.enrichdat()
   }
 
   return(new_ego)

diff --git a/R/genGSEA.R b/R/genGSEA.R
@@ -79,6 +79,6 @@ genGSEA <- function(genelist,
   #   new_egmt = egmt %>% as.data.frame()
   # }
   #
-  new_egmt <- egmt %>% as.data.frame()
+  new_egmt <- egmt %>% as.data.frame() %>% as.enrichdat()
   return(new_egmt)
 }
diff --git a/R/genKEGG.R b/R/genKEGG.R
@@ -85,11 +85,13 @@ genKEGG <- function(id,
     new_keg <- keg %>%
       as.data.frame() %>%
       dplyr::mutate(geneID = new_geneID) %>%
-      calcFoldEnrich()
+      calcFoldEnrich() %>%
+      as.enrichdat()
   } else {
     new_keg <- keg %>%
       as.data.frame() %>%
-      calcFoldEnrich()
+      calcFoldEnrich() %>%
+      as.enrichdat()
   }
 
   return(new_keg)

diff --git a/R/ploTheme.R b/R/ploTheme.R
@@ -15,7 +15,8 @@
 #' @importFrom ggplot2 theme_bw theme margin unit element_text element_rect element_line
 #' @examples
 #' library(ggplot2)
-#' ggplot(mtcars, aes(x=wt, y=mpg))+ geom_point()+ plot_theme(theme_type = 'bw')
+#' ggplot(mtcars, aes(x=wt, y=mpg))+ geom_point()+
+#'   plot_theme(theme_type = 'bw', font_type = 'Times', border_thick = 2)
 #' @export
 plot_theme <- function(theme_type = c('bw','classic'),
                        main_text_size = 14,

diff --git a/R/plotEnrichDot.R b/R/plotEnrichDot.R
@@ -2,7 +2,7 @@
 #'
 #' @param enrich_df `data.frame` of enrichment analysis result .
 #' @param xlab_type X-axis label type, one of 'GeneRatio','Count','FoldEnrich'.
-#' @param legend_by Stats legend type, one of "pvalue", "p.adjust", "qvalue".
+#' @param legend_type Stats legend type, one of "pvalue", "p.adjust", "qvalue".
 #' @param low_color Legend color for low pvalue or qvalue, default is "red".
 #' @param high_color Legend color for high pvalue or qvalue, default is "blue".
 #' @param show_item Numeric, select top N rows to show, default is 10.
@@ -27,12 +27,13 @@
 #'   org = "human", ont = "mf", pvalueCutoff = 0.05,
 #'   qvalueCutoff = 0.1, use_symbol = FALSE
 #' )
-#' plotEnrichDot(ego)
+#' plotEnrichDot(ego,remove_grid = T, main_text_size = 8,
+#'   legend_text_size = 6,border_thick = 1.5)
 #' }
 #'
 plotEnrichDot <- function(enrich_df,
                           xlab_type = c("FoldEnrich", "GeneRatio", "Count"),
-                          legend_by = c("p.adjust", "pvalue", "qvalue"),
+                          legend_type = c("p.adjust", "pvalue", "qvalue"),
                           low_color = "red",
                           high_color = "blue",
                           show_item = 10,
@@ -43,7 +44,7 @@ plotEnrichDot <- function(enrich_df,
   #--- args ---#
   stopifnot(is.numeric(show_item))
   xlab_type <- match.arg(xlab_type)
-  legend_by <- match.arg(legend_by)
+  legend_type <- match.arg(legend_type)
 
   types <- c("GeneRatio", "Count", "FoldEnrich")
   legends <- c("p.adjust", "pvalue", "qvalue")
@@ -54,9 +55,9 @@ plotEnrichDot <- function(enrich_df,
       paste(intersect(colnames(enrich_df), types), collapse = " | ")
     )
   }
-  if (!legend_by %in% colnames(enrich_df)) {
+  if (!legend_type %in% colnames(enrich_df)) {
     stop(
-      legend_by, " not included in this dataframe, try: ",
+      legend_type, " not included in this dataframe, try: ",
       paste(intersect(colnames(enrich_df), legends), collapse = " | ")
     )
   }
@@ -65,8 +66,8 @@ plotEnrichDot <- function(enrich_df,
   xlab_title <- ifelse(xlab_type == "FoldEnrich", "Fold Enrichment",
     ifelse(xlab_type == "GeneRatio", "Gene Ratio", "Count")
   )
-  legend_title <- ifelse(legend_by == "pvalue", "Pvalue",
-    ifelse(legend_by == "p.adjust", "P.adjust", "FDR")
+  legend_title <- ifelse(legend_type == "pvalue", "Pvalue",
+    ifelse(legend_type == "p.adjust", "P.adjust", "FDR")
   )
 
   if (show_item <= nrow(enrich_df)) {
@@ -83,7 +84,7 @@ plotEnrichDot <- function(enrich_df,
   #--- plot ---#
   p <- ggplot(enrich_df, aes(x = eval(parse(text = xlab_type)), y = Description)) +
     geom_point(aes(
-      color = eval(parse(text = legend_by)),
+      color = eval(parse(text = legend_type)),
       size = Count
     )) +
     scale_color_continuous(
@@ -92,7 +93,7 @@ plotEnrichDot <- function(enrich_df,
       labels = function(x) format(x, scientific = T)
     ) +
     xlab(xlab_title) +
-    labs(color = legend_by)+
+    labs(color = legend_type)+
     xlim(xlim_left,xlim_right)+
     plot_theme(...)
 
@@ -114,127 +115,3 @@ text_wraper <- function(width) {
 }
 
 
-##' Adjust dataframe for enrichment plot
-##'
-##' make sure colname contains Description, Count, FoldEnrich/GeneRatio, pvalue/qvalue/p.adjust
-##'
-##' @param enrich_df dataframe of enrichment analysis result .
-##'
-##' @importFrom stringr str_remove_all str_split str_remove
-##' @importFrom dplyr mutate pull
-##' @return A `data.frame`.
-##' @export
-
-as.enrichdat <- function(enrich_df) {
-
-  ## get lower case colnames
-  remove <- c("\\(", "\\)", " ", "-", "_")
-  to_check <- stringr::str_remove_all(tolower(colnames(enrich_df)), paste(remove, collapse = "|"))
-
-  ## find description col
-  if (!any(grepl("description", to_check))) {
-    check_description <- apply(enrich_df, 2, function(x) all(grepl("[A-Za-z]{3,}", x)))
-    check2 <- which(check_description)
-    if (any(check2 < (ncol(enrich_df) / 2))) {
-      colnames(enrich_df)[check_description][check2 < 5] <- "Description"
-    } else {
-      stop("Not found description column!")
-    }
-  } else {
-    colnames(enrich_df)[grepl("description", to_check)] <- "Description"
-  }
-
-  ## find count col
-  # if finds genes col, calc gene num as count; else find another col as count
-  if (!any(grepl("count", to_check))) {
-    check_gene <- apply(enrich_df, 2, function(x) all(grepl("[A-Za-z]{3,}|\\/|,", x) & !grepl("tags|list", x)))
-    check2 <- which(check_gene)
-    if (any(check2 > (ncol(enrich_df) / 2))) {
-      colnames(enrich_df)[check_gene][check2 > (ncol(enrich_df) / 2)] <- "geneID"
-      gen_num <- stringr::str_split(enrich_df$geneID, ",|\\/") %>%
-        lapply(., length) %>%
-        unlist()
-      enrich_df <- enrich_df %>% dplyr::mutate(Count = gen_num)
-    } else if (any(grepl("\\([1-9]{,4}\\)", colnames(enrich_df)))) {
-      gen_num <- enrich_df[grepl("\\([1-9]{,4}\\)", colnames(enrich_df))] %>%
-        dplyr::pull(1) %>%
-        as.numeric()
-      enrich_df <- enrich_df %>% dplyr::mutate(Count = gen_num)
-    } else {
-      stop('Please rename the gene count column as "Count"!')
-      # head(enrich_df[1:2,])
-      # message("Cannot auto-select count column...","\n","Please specify the column number which includes gene count...")
-      # answer <- scan(what = "character", n =1,quiet =T)
-      # message('Choose the No. ',answer,' column as gene count...')
-      # enrich_df = enrich_df %>% dplyr::rename(Count = eval(parse(text = answer)))
-    }
-  } else {
-    colnames(enrich_df)[grepl("count", to_check)] <- "Count"
-  }
-
-  ## find FoldEnrich col
-  # GSEA result has no FoldEnrich, need to exclude
-  if (!(any(grepl("\\benrichmentscore\\b", to_check)) & any(grepl("\\bleadingedge\\b", to_check)))) {
-    if (any(grepl("foldenrich|enrichment", to_check))) {
-      colnames(enrich_df)[grepl("foldenrich|enrichment", to_check)] <- "FoldEnrich"
-    } else {
-      stop('Please rename the fold enrichment column as "FoldEnrich"!')
-      # head(enrich_df[1:2,])
-      # message("Cannot auto-select fold enrichment column...","\n",
-      #         "Please specify the column number which includes fold enrichment...")
-      # answer <- scan(what = "character", n =1,quiet =T)
-      # message('Choose the No. ',answer,' column as fold enrichment...')
-      # enrich_df = enrich_df %>% dplyr::rename(FoldEnrich = eval(parse(text = answer)))
-    }
-  }
-
-
-  ## find GeneRatio col
-  if (any(grepl("generatio", to_check))) {
-    colnames(enrich_df)[grepl("generatio", to_check)] <- "GeneRatio"
-    enrich_df <- enrich_df %>% dplyr::mutate(GeneRatio = sapply(.$GeneRatio, function(x) eval(parse(text = x))))
-  } else {
-    # gsea
-    if (any(grepl("setsize", to_check))) {
-      colnames(enrich_df)[grepl("setsize", to_check)] <- "setSize"
-      enrich_df <- enrich_df %>% dplyr::mutate(GeneRatio = as.numeric(Count) / as.numeric(setSize))
-    } else if (any(grepl("\\([1-9]{,4}\\)", colnames(enrich_df)))) {
-      # panther result
-      setsize <- colnames(enrich_df)[grepl("\\([1-9]{,4}\\)", colnames(enrich_df))] %>%
-        stringr::str_remove(., ".*\\(") %>%
-        stringr::str_remove(., "\\)") %>%
-        as.numeric()
-      enrich_df <- enrich_df %>% dplyr::mutate(GeneRatio = as.numeric(Count) / setsize)
-    } else if (apply(enrich_df, 2, function(x) length(unique(x)) == 1)) {
-      setsize <- enrich_df[1, apply(enrich_df, 2, function(x) length(unique(x)) == 1)] %>%
-        stringr::str_remove("0") %>%
-        as.numeric() %>%
-        sort() %>%
-        .[1]
-      enrich_df <- enrich_df %>% dplyr::mutate(GeneRatio = as.numeric(Count) / setsize)
-    }
-  }
-
-  ## find pvalue col
-  if (any(grepl("pvalue", to_check))) {
-    colnames(enrich_df)[grepl("pvalue", to_check)] <- "pvalue"
-  } else if (any(grepl("\\buncorrectedpvalue\\b", to_check))) {
-    colnames(enrich_df)[grepl("\\buncorrectedpvalue\\b", to_check)] <- "pvalue"
-  }
-
-  ## find p.adjust col
-  if (any(grepl("p.adjust", to_check))) {
-    colnames(enrich_df)[grepl("p.adjust", to_check)] <- "p.adjust"
-  } else if (any(grepl("\\bcorrectedpvalue\\b", to_check))) {
-    colnames(enrich_df)[grepl("\\bcorrectedpvalue\\b", to_check)] <- "p.adjust"
-  }
-
-  ## find qvalue col
-  if (any(grepl("qvalue", to_check))) {
-    colnames(enrich_df)[grepl("qvalue", to_check)] <- "qvalue"
-  } else if (any(grepl("fdr", to_check) & !grepl("fdrrate", to_check))) {
-    colnames(enrich_df)[grepl("fdr", to_check)] <- "qvalue"
-  }
-
-  return(enrich_df)
-}
diff --git a/R/utils_tool.R b/R/utils_tool.R
@@ -248,3 +248,4 @@ calcFoldEnrich <- function(df) {
   }
   return(df)
 }
+
diff --git a/man/as.enrichdat.Rd b/man/as.enrichdat.Rd