adds rank_order_data

and makes the inputs across the three functions all match
MoseleyBioinformaticsLab · Apr 12, 2024 · d1533d9 · d1533d9
1 parent 4158d89
commit d1533d9
Show file tree

Hide file tree

Showing 46 changed files with 494 additions and 79 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ICIKendallTau
 Title: Calculates information-content-informed Kendall-tau
-Version: 1.1.1
+Version: 1.1.2
 Authors@R: c(
     person(
       given = c("Robert", "M"), 
@@ -37,7 +37,8 @@ Suggests: furrr,
           dplyr, 
           logger, 
           withr,
-          naniar
+          ggplot2,
+          visdat
 URL: https://moseleybioinformaticslab.github.io/ICIKendallTau/
         https://github.com/moseleybioinformaticslab/ICIKendallTau/
 BugReports:

diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(log_memory)
 export(log_message)
 export(long_df_2_cor_matrix)
 export(pairwise_completeness)
+export(rank_order_data)
 export(show_progress)
 export(test_left_censorship)
 importFrom(Rcpp,sourceCpp)

diff --git a/R/left_censorship.R b/R/left_censorship.R
@@ -4,7 +4,7 @@
 #' is due to values being below the limit of detection, or coming from a 
 #' left-censored distribution.
 #' 
-#' @param in_data matrix or data.frame of numeric data
+#' @param data_matrix matrix or data.frame of numeric data
 #' @param sample_classes which samples are in which class
 #' @param global_na what represents zero or missing?
 #' 
@@ -30,25 +30,28 @@
 #' 
 #' @export
 #' @return data.frame of trials / successes, and binom.test result
-test_left_censorship = function(in_data, 
+test_left_censorship = function(data_matrix, 
                                 sample_classes = NULL, 
                                 global_na = c(NA, Inf, 0))
 {
+  if (inherits(data_matrix, "data.frame")) {
+    data_matrix = as.matrix(data_matrix)
+  }
   if (is.null(sample_classes)) {
-    sample_classes = rep("A", ncol(in_data))
+    sample_classes = rep("A", ncol(data_matrix))
   }
 
-  split_indices = split(seq_len(ncol(in_data)), sample_classes)
-  missing_loc = setup_missing_matrix(in_data, global_na)
-  in_data_missing = in_data
-  in_data_missing[missing_loc] = NA
+  split_indices = split(seq_len(ncol(data_matrix)), sample_classes)
+  missing_loc = setup_missing_matrix(data_matrix, global_na)
+  data_matrix_missing = data_matrix
+  data_matrix_missing[missing_loc] = NA
 
   # split the dataset by group
   split_counts = purrr::imap(split_indices, \(in_split, split_id){
     # in_split = split_indices[[1]]
 
     # grab the group we want to work with
-    split_missing = in_data_missing[, in_split, drop = FALSE]
+    split_missing = data_matrix_missing[, in_split, drop = FALSE]
 
     # count the number of missing samples for each feature,
     # and keep those that have at least one

diff --git a/R/rank-ordering.R b/R/rank-ordering.R
@@ -1,5 +1,21 @@
+#' Rank order row data
+#' 
+#' Given a data-matrix of numeric data, calculates the rank of each row in each
+#' column (feature in sample), gets the median rank of each row across all
+#' columns, and returns the original data with missing values set to NA, the
+#' data with rows reordered by decreasing median rank and columns reordered by
+#' decreasing fraction of missing values, and a data.frame of the number of
+#' missing values and the median rank of each feature.
+#'
+#' @param data_matrix matrix or data.frame of values
+#' @param global_na the values to consider as missing
+#' 
+#' @export
+#' 
+#' @returns list with two matrices, `original` and `ordered`, and a data.frame,
+#'   `n_na_rank`, with columns `n_na` and `median_rank`
 rank_order_data = function(data_matrix, global_na = c(NA, Inf, 0))
 {
+  if (inherits(data_matrix, "data.frame")) {
+    data_matrix = as.matrix(data_matrix)
+  }
   missing_loc = setup_missing_matrix(data_matrix, global_na)
   data_matrix_na = data_matrix
   data_matrix_na[missing_loc] = NA
@@ -8,12 +24,16 @@ rank_order_data = function(data_matrix, global_na = c(NA, Inf, 0))
 
   })
   sample_ranks = do.call(cbind, sample_ranks)
-  median_ranks = apply(sample_ranks, 1, median)
-  rank_order = order(median_ranks, decreasing = TRUE)
+  median_rank = apply(sample_ranks, 1, median)
+
+  n_na = rowSums(is.na(data_matrix_na))
+  rank_order = order(median_rank, decreasing = TRUE)
 
   perc_missing = colSums(is.na(data_matrix_na)) / nrow(data_matrix_na)
   perc_order = order(perc_missing, decreasing = TRUE)
 
-  return(original = data_matrix_na,
-         ordered = data_matrix_na[rank_order, perc_order])
+  return(list(original = data_matrix_na,
+              ordered = data_matrix_na[rank_order, perc_order],
+              n_na_rank = data.frame(n_na = n_na,
+                                     median_rank = median_rank)))
 }
diff --git a/README.Rmd b/README.Rmd
@@ -66,6 +66,7 @@ The functions that implement this include:
   * Otherwise will only use a single core.
 
 We've also included a function for testing if the missingness in your data comes from left-censorship, `test_left_censorship`. We walk through creating example data and testing it in the vignette [Testing for Left Censorship](https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/testing-for-left-censorship).
+In addition to testing, you can also visualize the missing data pattern by feature rank using the `rank_order_data` function, and use `visdat::vis_miss()` on the original and reordered missing data.
 
 ## Examples
 

diff --git a/README.html b/README.html
@@ -606,7 +606,7 @@
 <h1 id="icikendalltau">ICIKendallTau</h1>
 <!-- badges: start -->
 
-<p><a href="https://moseleybioinformaticslab.r-universe.dev"><img src="data:image/svg+xml; charset=utf-8;base64,PHN2ZyB3aWR0aD0iMTA1LjgiIGhlaWdodD0iMjAiIHZpZXdCb3g9IjAgMCAxMDU4IDIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiByb2xlPSJpbWciIGFyaWEtbGFiZWw9InItdW5pdmVyc2U6IDEuMC41Ij4KICA8YSBocmVmPSJodHRwczovL21vc2VsZXliaW9pbmZvcm1hdGljc2xhYi5yLXVuaXZlcnNlLmRldi9JQ0lLZW5kYWxsVGF1IiBhbHQ9InItdW5pdmVyc2UiPgogIDx0aXRsZT5yLXVuaXZlcnNlOiAxLjAuNTwvdGl0bGU+CiAgPGxpbmVhckdyYWRpZW50IGlkPSJlVG1LdiIgeDI9IjAiIHkyPSIxMDAlIj4KICAgIDxzdG9wIG9mZnNldD0iMCIgc3RvcC1vcGFjaXR5PSIuMSIgc3RvcC1jb2xvcj0iI0VFRSIvPgogICAgPHN0b3Agb2Zmc2V0PSIxIiBzdG9wLW9wYWNpdHk9Ii4xIi8+CiAgPC9saW5lYXJHcmFkaWVudD4KICA8bWFzayBpZD0iZlZlWkkiPjxyZWN0IHdpZHRoPSIxMDU4IiBoZWlnaHQ9IjIwMCIgcng9IjMwIiBmaWxsPSIjRkZGIi8+PC9tYXNrPgogIDxnIG1hc2s9InVybCgjZlZlWkkpIj4KICAgIDxyZWN0IHdpZHRoPSI2NjgiIGhlaWdodD0iMjAwIiBmaWxsPSIjNTU1Ii8+CiAgICA8cmVjdCB3aWR0aD0iMzkwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iIzNDMSIgeD0iNjY4Ii8+CiAgICA8cmVjdCB3aWR0aD0iMTA1OCIgaGVpZ2h0PSIyMDAiIGZpbGw9InVybCgjZVRtS3YpIi8+CiAgPC9nPgogIDxnIGFyaWEtaGlkZGVuPSJ0cnVlIiBmaWxsPSIjZmZmIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIGZvbnQtZmFtaWx5PSJWZXJkYW5hLERlamFWdSBTYW5zLHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEwIj4KICAgIDx0ZXh0IHg9IjYwIiB5PSIxNDgiIHRleHRMZW5ndGg9IjU2OCIgZmlsbD0iIzAwMCIgb3BhY2l0eT0iMC4yNSI+ci11bml2ZXJzZTwvdGV4dD4KICAgIDx0ZXh0IHg9IjUwIiB5PSIxMzgiIHRleHRMZW5ndGg9IjU2OCI+ci11bml2ZXJzZTwvdGV4dD4KICAgIDx0ZXh0IHg9IjcyMyIgeT0iMTQ4IiB0ZXh0TGVuZ3RoPSIyOTAiIGZpbGw9IiMwMDAiIG9wYWNpdHk9IjAuMjUiPjEuMC41PC90ZXh0PgogICAgPHRleHQgeD0iNzEzIiB5PSIxMzgiIHRleHRMZW5ndGg9IjI5MCI+MS4wLjU8L3RleHQ+CiAgPC9nPgogIAogIDwvYT4KPC9zdmc+" alt="ICIKendallTau status badge" /></a></p>
+<p><a href="https://moseleybioinformaticslab.r-universe.dev"><img src="data:image/svg+xml; charset=utf-8;base64,PHN2ZyB3aWR0aD0iMTA1LjgiIGhlaWdodD0iMjAiIHZpZXdCb3g9IjAgMCAxMDU4IDIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiByb2xlPSJpbWciIGFyaWEtbGFiZWw9InItdW5pdmVyc2U6IDEuMS4wIj4KICA8YSBocmVmPSJodHRwczovL21vc2VsZXliaW9pbmZvcm1hdGljc2xhYi5yLXVuaXZlcnNlLmRldi9JQ0lLZW5kYWxsVGF1IiBhbHQ9InItdW5pdmVyc2UiPgogIDx0aXRsZT5yLXVuaXZlcnNlOiAxLjEuMDwvdGl0bGU+CiAgPGxpbmVhckdyYWRpZW50IGlkPSJQY0NyTyIgeDI9IjAiIHkyPSIxMDAlIj4KICAgIDxzdG9wIG9mZnNldD0iMCIgc3RvcC1vcGFjaXR5PSIuMSIgc3RvcC1jb2xvcj0iI0VFRSIvPgogICAgPHN0b3Agb2Zmc2V0PSIxIiBzdG9wLW9wYWNpdHk9Ii4xIi8+CiAgPC9saW5lYXJHcmFkaWVudD4KICA8bWFzayBpZD0iVU9zZnIiPjxyZWN0IHdpZHRoPSIxMDU4IiBoZWlnaHQ9IjIwMCIgcng9IjMwIiBmaWxsPSIjRkZGIi8+PC9tYXNrPgogIDxnIG1hc2s9InVybCgjVU9zZnIpIj4KICAgIDxyZWN0IHdpZHRoPSI2NjgiIGhlaWdodD0iMjAwIiBmaWxsPSIjNTU1Ii8+CiAgICA8cmVjdCB3aWR0aD0iMzkwIiBoZWlnaHQ9IjIwMCIgZmlsbD0iIzNDMSIgeD0iNjY4Ii8+CiAgICA8cmVjdCB3aWR0aD0iMTA1OCIgaGVpZ2h0PSIyMDAiIGZpbGw9InVybCgjUGNDck8pIi8+CiAgPC9nPgogIDxnIGFyaWEtaGlkZGVuPSJ0cnVlIiBmaWxsPSIjZmZmIiB0ZXh0LWFuY2hvcj0ic3RhcnQiIGZvbnQtZmFtaWx5PSJWZXJkYW5hLERlamFWdSBTYW5zLHNhbnMtc2VyaWYiIGZvbnQtc2l6ZT0iMTEwIj4KICAgIDx0ZXh0IHg9IjYwIiB5PSIxNDgiIHRleHRMZW5ndGg9IjU2OCIgZmlsbD0iIzAwMCIgb3BhY2l0eT0iMC4yNSI+ci11bml2ZXJzZTwvdGV4dD4KICAgIDx0ZXh0IHg9IjUwIiB5PSIxMzgiIHRleHRMZW5ndGg9IjU2OCI+ci11bml2ZXJzZTwvdGV4dD4KICAgIDx0ZXh0IHg9IjcyMyIgeT0iMTQ4IiB0ZXh0TGVuZ3RoPSIyOTAiIGZpbGw9IiMwMDAiIG9wYWNpdHk9IjAuMjUiPjEuMS4wPC90ZXh0PgogICAgPHRleHQgeD0iNzEzIiB5PSIxMzgiIHRleHRMZW5ndGg9IjI5MCI+MS4xLjA8L3RleHQ+CiAgPC9nPgogIAogIDwvYT4KPC9zdmc+" alt="ICIKendallTau status badge" /></a></p>
 <!-- badges: end -->
 
 <p>You can see the pkgdown site <a href="https://moseleybioinformaticslab.github.io/ICIKendallTau/">here</a>.</p>
@@ -665,7 +665,11 @@ <h2 id="package-functions">Package Functions</h2>
 <p>We’ve also included a function for testing if the missingness in your
 data comes from left-censorship, <code>test_left_censorship</code>. We
 walk through creating example data and testing it in the vignette <a href="https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/testing-for-left-censorship">Testing
-for Left Censorship</a>.</p>
+for Left Censorship</a>. In addition to testing, you can also visualize
+the missing data pattern by feature rank using the
+<code>rank_order_data</code> function, and use
+<code>visdat::vis_miss()</code> on the original and reordered missing
+data.</p>
 <h2 id="examples">Examples</h2>
 <p>The most common case is a large matrix of independent samples
 (columns) and measured features in each of the samples (i.e. gene
@@ -725,10 +729,10 @@ <h2 id="is-it-fast">Is It Fast?</h2>
 <span id="cb5-14"><a href="#cb5-14" tabindex="-1"></a>  <span class="at">times =</span> <span class="dv">5</span></span>
 <span id="cb5-15"><a href="#cb5-15" tabindex="-1"></a>)</span>
 <span id="cb5-16"><a href="#cb5-16" tabindex="-1"></a><span class="co">#&gt; Unit: microseconds</span></span>
-<span id="cb5-17"><a href="#cb5-17" tabindex="-1"></a><span class="co">#&gt;                           expr       min        lq       mean    median        uq       max neval</span></span>
-<span id="cb5-18"><a href="#cb5-18" tabindex="-1"></a><span class="co">#&gt;  cor(x, y, method = &quot;kendall&quot;) 11685.244 12730.878 12860.9060 13071.630 13406.514 13410.264     5</span></span>
-<span id="cb5-19"><a href="#cb5-19" tabindex="-1"></a><span class="co">#&gt;         ici_kt(x, y, &quot;global&quot;)   263.306   268.503   332.1288   274.858   283.589   570.388     5</span></span>
-<span id="cb5-20"><a href="#cb5-20" tabindex="-1"></a><span class="co">#&gt;       ici_kt(x2, y2, &quot;global&quot;) 14110.743 14322.836 15782.6490 16053.907 16595.979 17829.780     5</span></span></code></pre></div>
+<span id="cb5-17"><a href="#cb5-17" tabindex="-1"></a><span class="co">#&gt;                           expr       min        lq       mean    median       uq       max neval</span></span>
+<span id="cb5-18"><a href="#cb5-18" tabindex="-1"></a><span class="co">#&gt;  cor(x, y, method = &quot;kendall&quot;) 11666.371 11671.671 12405.0886 12084.801 13276.95 13325.649     5</span></span>
+<span id="cb5-19"><a href="#cb5-19" tabindex="-1"></a><span class="co">#&gt;         ici_kt(x, y, &quot;global&quot;)   253.826   255.717   430.9002   277.985   317.24  1049.733     5</span></span>
+<span id="cb5-20"><a href="#cb5-20" tabindex="-1"></a><span class="co">#&gt;       ici_kt(x2, y2, &quot;global&quot;) 13405.302 13731.770 15208.7108 14693.928 15415.66 18796.894     5</span></span></code></pre></div>
 <p>In the case of 40,000 features, the average time on a modern CPU is
 14 milliseconds.</p>
 <p>Of course, if you want to use it to calculate Kendall-tau-b without
@@ -770,7 +774,7 @@ <h2 id="many-many-comparisons">Many Many Comparisons</h2>
 <span id="cb9-7"><a href="#cb9-7" tabindex="-1"></a><span class="co">#&gt; 3 s4 s4    0 1.0000000      0 1.000000 1.0000000</span></span>
 <span id="cb9-8"><a href="#cb9-8" tabindex="-1"></a><span class="co">#&gt; </span></span>
 <span id="cb9-9"><a href="#cb9-9" tabindex="-1"></a><span class="co">#&gt; $run_time</span></span>
-<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a><span class="co">#&gt; [1] 0.01747489</span></span></code></pre></div>
+<span id="cb9-10"><a href="#cb9-10" tabindex="-1"></a><span class="co">#&gt; [1] 0.01783729</span></span></code></pre></div>
 <h2 id="code-of-conduct">Code of Conduct</h2>
 <p>Please note that the ICIKendallTau project is released with a <a href="https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html">Contributor
 Code of Conduct</a>. By contributing to this project, you agree to abide

diff --git a/README.md b/README.md
@@ -71,6 +71,9 @@ We’ve also included a function for testing if the missingness in your
 data comes from left-censorship, `test_left_censorship`. We walk through
 creating example data and testing it in the vignette [Testing for Left
 Censorship](https://moseleybioinformaticslab.github.io/ICIKendallTau/articles/testing-for-left-censorship).
+In addition to testing, you can also visualize the missing data pattern
+by feature rank using the `rank_order_data` function, and use
+`visdat::vis_miss()` on the original and reordered missing data.
 
 ## Examples
 
@@ -142,10 +145,10 @@ microbenchmark(
   times = 5
 )
 #> Unit: microseconds
-#>                           expr       min        lq       mean    median        uq       max neval
-#>  cor(x, y, method = "kendall") 11685.244 12730.878 12860.9060 13071.630 13406.514 13410.264     5
-#>         ici_kt(x, y, "global")   263.306   268.503   332.1288   274.858   283.589   570.388     5
-#>       ici_kt(x2, y2, "global") 14110.743 14322.836 15782.6490 16053.907 16595.979 17829.780     5
+#>                           expr       min        lq       mean    median       uq       max neval
+#>  cor(x, y, method = "kendall") 11666.371 11671.671 12405.0886 12084.801 13276.95 13325.649     5
+#>         ici_kt(x, y, "global")   253.826   255.717   430.9002   277.985   317.24  1049.733     5
+#>       ici_kt(x2, y2, "global") 13405.302 13731.770 15208.7108 14693.928 15415.66 18796.894     5
 ```
 
 In the case of 40,000 features, the average time on a modern CPU is 14
@@ -204,7 +207,7 @@ r_4
 #> 3 s4 s4    0 1.0000000      0 1.000000 1.0000000
 #> 
 #> $run_time
-#> [1] 0.01747489
+#> [1] 0.01783729
 ```
 
 ## Code of Conduct

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
diff --git a/docs/LICENSE.html b/docs/LICENSE.html
diff --git a/docs/articles/ici-kendalltau.html b/docs/articles/ici-kendalltau.html
diff --git a/docs/articles/index.html b/docs/articles/index.html