add new file

GangLiLab · Sep 6, 2021 · 1a8e245 · 1a8e245
1 parent 419ef79
commit 1a8e245
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 20 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -29,6 +29,7 @@ Imports:
 Suggests:
     BiocManager,
     cowplot,
+    DOSE,
     data.table,
     easyPubMed,
     futile.logger,

diff --git a/NAMESPACE b/NAMESPACE
@@ -16,7 +16,6 @@ importFrom(clusterProfiler,GSEA)
 importFrom(clusterProfiler,enrichGO)
 importFrom(clusterProfiler,enrichKEGG)
 importFrom(dplyr,"%>%")
-importFrom(dplyr,add_row)
 importFrom(dplyr,all_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
@@ -30,7 +29,6 @@ importFrom(dplyr,na_if)
 importFrom(dplyr,pull)
 importFrom(dplyr,relocate)
 importFrom(dplyr,select)
-importFrom(dplyr,slice)
 importFrom(dplyr,slice_head)
 importFrom(dplyr,summarize)
 importFrom(ggplot2,aes)

diff --git a/R/genInfo.R b/R/genInfo.R
@@ -2,19 +2,20 @@
 #'
 #' @param id Gene id (symbol, ensembl or entrez id) or uniprot id.
 #' @param org Species name from `biocOrg_name`, both full name and short name are fine.
-#' @param unique Logical to keep only one unique mapped ID, default is FALSE.
+#' @param unique Logical; if TRUE, keep only one matched ID for each input id. Default is FALSE.
 #' @importFrom stringr str_detect
-#' @importFrom dplyr %>% filter relocate select mutate mutate_all na_if slice add_row
+#' @importFrom dplyr %>% filter relocate select mutate mutate_all na_if
 #' @importFrom tidyr unnest
 #'
 #' @return A `data.frame`.
 #' @export
 #'
 #' @examples
+#' # input id contains fake id and one-to-many match id
 #' x <- genInfo(id = c(
 #'   "MCM10", "CDC20", "S100A9", "MMP1", "BCC7",
-#'   "FAKEID", "TP53", "HBD", "NUDT10"),
-#'   org = "hg", unique = FALSE)
+#'   "FAKEID", "TP53", "HBD", "NUDT10"
+#' ), org = "hg", unique = FALSE)
 #' head(x)
 genInfo <- function(id,
                     org,
@@ -36,17 +37,19 @@ genInfo <- function(id,
   tmp1 <- data.frame(input_id = id)
   tmp2 <- all %>% dplyr::filter(eval(parse(text = keytype)) %in% id)
 
+  tmp3  =tmp2 %>% dplyr::select(-c('symbol','uniprot')) %>% apply(.,1,is.na)
+
   ## keep each id even has no info
   # only symbol id needs to consider alias
   if (keytype != "symbol") {
     gene_info <- merge(tmp1, tmp2, by.x = "input_id", by.y = keytype, all.x = T)
-  } else {
+  } else if(any(apply(tmp3, 2, sum)  == nrow(tmp3))){
+    # at least one matched row is NA in every column except symbol and uniprot
     tmp2 <- tmp2 %>% dplyr::filter_at(dplyr::vars(-symbol, -uniprot), dplyr::all_vars(!is.na(.)))
     gene_info <- merge(tmp1, tmp2, by.x = "input_id", by.y = keytype, all.x = T) %>%
       dplyr::arrange(id) %>%
       dplyr::mutate(symbol = dplyr::case_when(input_id %in% tmp2$symbol ~ input_id)) %>%
       dplyr::relocate(symbol, .after = input_id)
-
     # check if symbol in alias (only check input ids without matched)
     all_alias <- data.frame(all_alias = paste(all$ncbi_alias,all$ensembl_alias, sep = "; "))
     check_row <- which(is.na(gene_info$symbol))
@@ -60,11 +63,25 @@ genInfo <- function(id,
         gene_info = gene_info %>% dplyr::slice(-i)
       }
     }
+  }else{
+    gene_info <- merge(tmp1, tmp2, by.x = "input_id", by.y = keytype, all.x = T) %>%
+      dplyr::mutate(symbol = dplyr::case_when(input_id %in% all$symbol ~ input_id)) %>%
+      dplyr::relocate(symbol, .after = input_id)
+
+    # check whether the input id appears among the aliases (only for input ids without a symbol match)
+    all_alias <- data.frame(all_alias = paste(all$ncbi_alias, all$ensembl_alias, sep = "; "))
+    check_row <- which(is.na(gene_info$symbol))
+    for (i in check_row) {
+      alias_row <- which(stringr::str_detect(all_alias[, 1], paste0("\\b", gene_info[i, 1], "\\b")))
+      if (length(alias_row) != 0) {
+        # input id matches no symbol but does match an alias
+        gene_info[i, 2:ncol(gene_info)] <- all[alias_row, ]
+      }
+    }
   }
 
   # one-to-many match
-  check_n <- table(gene_info$input_id)
-  tomany_id <- names(check_n)[check_n > 1]
+  tomany_id <- names(table(gene_info$input_id))[table(gene_info$input_id) > 1]
   tomany_id <- tomany_id[!tomany_id %in% id[duplicated(id)]]
   if (length(tomany_id) > 0 & length(tomany_id) < 3 & !unique) {
     message(paste0(
@@ -78,9 +95,8 @@ genInfo <- function(id,
     ))
   }
 
-  # if keep unique, choose row with minimum NA;
-  # if NA number is identical, then choose the smallest entrezid
-  if (unique) {
+  # if unique is TRUE, keep the row with the fewest NA values
+  if (unique & length(tomany_id) != 0) {
     sub = gene_info %>% dplyr::filter(input_id %in% tomany_id)
     other = gene_info %>% dplyr::filter(!input_id %in% tomany_id)
 
@@ -98,7 +114,7 @@ genInfo <- function(id,
     gene_info = rbind(other, sub[uniq_order,])
     gene_info = gene_info[match(id,gene_info$input_id),]
 
-  } else {
+  } else{
     id = factor(id,ordered = T,levels = unique(id))
     gene_info$input_id = factor(gene_info$input_id,ordered = T,levels = unique(id))
     gene_info = gene_info[order(gene_info$input_id),]

diff --git a/R/transID.R b/R/transID.R
@@ -14,7 +14,7 @@
 #' transId(
 #'   id = c("Cyp2c23", "Fhit", "Gal3st2b", "Trp53", "Tp53"),
 #'   trans_to = "ensembl", org = "mouse", unique = TRUE)
-#' # input id contains duplicates,fake id and one-to-many match id
+#' # input id contains fake id and one-to-many match id
 #' transId(
 #'   id = c("MMD2", "HBD", "RNR1", "TEC", "BCC7", "FAKEID", "TP53"),
 #'   trans_to = "entrez", org = "hg", unique = FALSE)
@@ -48,7 +48,7 @@ transId <- function(id, trans_to, org, unique = TRUE) {
   message('\n',percent, " genes are mapped from ", from, " to ", trans_to)
   if (n_new != length(id)) {
     message(paste0(
-      "Non matched ID are marked as NA",
+      "Non-matched ID are marked as NA",
       '...\nMaybe use "na.omit()" for downstream analysis'
     ))
   }

diff --git a/man/genInfo.Rd b/man/genInfo.Rd
diff --git a/man/transId.Rd b/man/transId.Rd