In [4]:
library(tidyverse)
library(ggplot2)
library(lubridate)

“package ‘tibble’ was built under R version 4.2.3”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [5]:
# Print the R version, platform and attached/loaded package versions so the
# environment this notebook was run in is recorded alongside the results.
sessionInfo()

R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS/LAPACK: /Users/joonklaps/opt/anaconda3/lib/libopenblasp-r0.3.20.dylib

locale:
[1] C/C.UTF-8/C/C/C/C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4    
 [5] purrr_1.0.2     readr_2.1.5     tidyr_1.3.1     tibble_3.2.1   
 [9] ggplot2_3.5.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] pillar_1.10.2      compiler_4.2.1     RColorBrewer_1.1-3 base64enc_0.1-3   
 [5] tools_4.2.1        digest_0.6.36      uuid_1.2-0         timechange_0.3.0  
 [9] jsonlite_1.8.8     evaluate_1.0.3     lifecycle_1.0.4    gtable_0.3.6      
[13] pkgconfig_2.0.3    rlang_1.1.4        IRdisplay_1.1      cli_3.6.3         
[17] IRkernel_1.3.2     fastmap_1.2.0      repr_1.1.7         withr_3.0.2       
[21] g

In [6]:
# Load the tab-separated execution trace into a data frame. The strings
# "NA", "N/A", "null", "-" and "" are all read as missing values.
trace_file <- read.delim(
  "./human-virus/pipeline_info/execution_trace_2025-06-08_21-30-08.txt",
  na.strings = c("NA", "N/A", "null", "-", "")
)

In [7]:
# Convert human-readable memory sizes to a number of bytes (vectorised).
#
# x: character vector (anything coercible with as.character()) holding values
#    such as "79.2 GB", "512 KB", "100 B" or a bare number like "0".
#
# Returns a numeric vector the same length as `x`. A value without a unit is
# taken to be in bytes already. Units are binary multiples (1 KB = 1024 B).
# Missing values, unparseable strings and unknown units yield NA.
convb <- function(x) {
  x <- trimws(as.character(x))

  # A number (optional sign, decimals and exponent), optional whitespace,
  # then an optional alphabetic unit. Anchored so partial matches are rejected.
  ptn <- paste0(
    "^([-+]?(?:\\d+\\.?\\d*|\\.\\d+)(?:[eE][-+]?\\d+)?)",
    "\\s*([A-Za-z]*)$"
  )
  ok <- !is.na(x) & grepl(ptn, x, perl = TRUE)

  num <- rep(NA_real_, length(x))
  unit <- rep(NA_character_, length(x))
  num[ok] <- as.numeric(sub(ptn, "\\1", x[ok], perl = TRUE))
  unit[ok] <- sub(ptn, "\\2", x[ok], perl = TRUE)

  # No unit at all means the value is a plain byte count.
  unit[ok & unit == ""] <- "1"

  mult <- c(
    "1" = 1, "B" = 1,
    "KB" = 1024, "MB" = 1024^2, "GB" = 1024^3,
    "TB" = 1024^4, "PB" = 1024^5, "EB" = 1024^6
  )

  # Indexing with an unknown or NA unit gives NA, which propagates to the
  # result instead of raising an error.
  num * unname(mult[unit])
}

In [8]:
# Convert duration strings to milliseconds (vectorised).
#
# time_str: character vector (anything coercible with as.character()) of
#   durations. Each string may combine day, hour, minute, second and
#   millisecond components, e.g. "1d 2h", "1h 2m 3s", "34.8s" or "500ms".
#   A plain number such as "30" is interpreted as seconds.
#
# Returns a numeric vector the same length as `time_str`. Components are
# summed; strings with no recognisable component, including NA, contribute 0.
time_to_ms <- function(time_str) {
  time_str <- as.character(time_str)

  # Milliseconds represented by one of each supported unit suffix.
  unit_ms <- c(
    d = 24 * 60 * 60 * 1000,
    h = 60 * 60 * 1000,
    m = 60 * 1000,
    s = 1000,
    ms = 1
  )

  number <- "(\\d+(?:\\.\\d+)?)"
  total_ms <- rep(0, length(time_str))

  for (unit in names(unit_ms)) {
    # The negative lookahead stops a unit from matching the start of a longer
    # one: without it "500ms" would also be counted as 500 minutes.
    ptn <- paste0(number, unit, "(?![A-Za-z])")
    hit <- grepl(ptn, time_str, perl = TRUE)
    if (any(hit)) {
      value <- as.numeric(sub(
        paste0("^.*?", ptn, ".*$"), "\\1", time_str[hit],
        perl = TRUE
      ))
      total_ms[hit] <- total_ms[hit] + value * unit_ms[[unit]]
    }
  }

  # Plain numbers carry no unit suffix and are assumed to be seconds.
  plain_number <- grepl("^\\d+(\\.\\d+)?$", time_str)
  if (any(plain_number)) {
    total_ms[plain_number] <- total_ms[plain_number] +
      as.numeric(time_str[plain_number]) * 1000
  }

  total_ms
}

# Convert milliseconds to a readable duration string (vectorised).
#
# ms: numeric vector of durations in milliseconds.
#
# Returns a character vector the same length as `ms`:
#   * values below 1000 are shown as-is with an "ms" suffix ("500ms");
#   * larger values are shown as space-separated hour/minute/second parts
#     with zero parts omitted and seconds rounded to one decimal place
#     ("411h 53m 21.5s", "1h", "1m 30s");
#   * NA input gives NA_character_.
ms_to_time <- function(ms) {
  result <- rep(NA_character_, length(ms))

  known <- !is.na(ms)
  small <- known & ms < 1000
  large <- known & !small

  if (any(small)) {
    result[small] <- paste0(ms[small], "ms")
  }

  if (any(large)) {
    # Work in whole tenths of a second. Rounding before splitting lets a
    # value such as 119960 ms carry over to "2m" rather than "1m 60s", and
    # keeps the arithmetic on exact integers.
    tenths <- round(ms[large] / 100)
    hours <- tenths %/% 36000
    minutes <- (tenths %% 36000) %/% 600
    seconds <- (tenths %% 600) / 10

    # sprintf("%.0f") keeps very large hour counts out of scientific notation.
    hour_txt <- ifelse(hours > 0, paste0(sprintf("%.0f", hours), "h "), "")
    minute_txt <- ifelse(minutes > 0, paste0(minutes, "m "), "")
    second_txt <- ifelse(seconds > 0, paste0(seconds, "s"), "")

    # trimws() drops the trailing separator left when the last part is absent.
    result[large] <- trimws(paste0(hour_txt, minute_txt, second_txt))
  }

  result
}

In [9]:
# Kaiju for reads failed multiple times and skewed the results, so every task
# whose name contains "FASTQ_KRAKEN_KAIJU" is left out of the summary.
#
# The remaining rows are collapsed to a single row:
#   sum_cpu_time  - total of the `duration` column, as a readable string
#   sum_real_time - total of the `realtime` column, as a readable string
#   peak_ram      - largest `peak_rss` value, formatted with automatic units
#   peak_vmem     - largest `peak_vmem` value, formatted with automatic units
# NOTE(review): `sum_cpu_time` is derived from `duration`; confirm that column
# is the intended measure of CPU time.
result <- trace_file %>%
  filter(!grepl("FASTQ_KRAKEN_KAIJU", name)) %>%
  summarise(
    sum_cpu_time = ms_to_time(sum(time_to_ms(duration), na.rm = TRUE)),
    sum_real_time = ms_to_time(sum(time_to_ms(realtime), na.rm = TRUE)),
    # Tagging the byte count as an "object_size" lets format() dispatch to
    # the size formatter without reaching into the utils namespace.
    peak_ram = format(
      structure(max(convb(peak_rss), na.rm = TRUE), class = "object_size"),
      units = "auto"
    ),
    peak_vmem = format(
      structure(max(convb(peak_vmem), na.rm = TRUE), class = "object_size"),
      units = "auto"
    )
  )
result

sum_cpu_time,sum_real_time,peak_ram,peak_vmem
<chr>,<chr>,<chr>,<chr>
411h 53m 21.5s,382h 56m 53.1s,79.2 Gb,85.3 Gb
