In [22]:
library(tidyverse)
library(ggplot2)
library(lubridate)
library(humanFormat)

In [2]:
R.version

               _                           
platform       x86_64-apple-darwin13.4.0   
arch           x86_64                      
os             darwin13.4.0                
system         x86_64, darwin13.4.0        
status                                     
major          4                           
minor          2.1                         
year           2022                        
month          06                          
day            23                          
svn rev        82513                       
language       R                           
version.string R version 4.2.1 (2022-06-23)
nickname       Funny-Looking Kid           

In [18]:
trace_file <- read.csv("./viralgenie/pipeline_info/execution_trace_2025-06-08_21-30-08.txt", sep="\t", na.strings = c("NA", "N/A", "null", "-",""))

In [29]:
convb <- function(x){
  ptn <- "(\\d*(.\\d+)*) (.*)"
  num  <- as.numeric(sub(ptn, "\\1", x))
  unit <- sub(ptn, "\\3", x)
  unit[unit==""] <- "1"

  mult <- c("1"=1, "KB"=1024, "MB"=1024^2, "GB"=1024^3)
  num * unname(mult[unit])
}

In [40]:
# Vectorized function to convert time strings to milliseconds
time_to_ms <- function(time_str) {
  # Handle different input formats
  time_str <- as.character(time_str)

  # Initialize total milliseconds vector
  total_ms <- rep(0, length(time_str))

  # Parse different time components using vectorized operations
  # Handle hours (e.g., "1h", "2.5h")
  hour_match <- grepl("\\d+(\\.\\d+)?h", time_str)
  if (any(hour_match)) {
    hours <- as.numeric(gsub(".*?(\\d+(?:\\.\\d+)?)h.*", "\\1", time_str[hour_match]))
    total_ms[hour_match] <- total_ms[hour_match] + (hours * 60 * 60 * 1000)
  }

  # Handle minutes (e.g., "30m", "20.5m")
  minute_match <- grepl("\\d+(\\.\\d+)?m", time_str)
  if (any(minute_match)) {
    minutes <- as.numeric(gsub(".*?(\\d+(?:\\.\\d+)?)m.*", "\\1", time_str[minute_match]))
    total_ms[minute_match] <- total_ms[minute_match] + (minutes * 60 * 1000)
  }

  # Handle seconds (e.g., "30s", "34.8s")
  second_match <- grepl("\\d+(\\.\\d+)?s", time_str)
  if (any(second_match)) {
    seconds <- as.numeric(gsub(".*?(\\d+(?:\\.\\d+)?)s.*", "\\1", time_str[second_match]))
    total_ms[second_match] <- total_ms[second_match] + (seconds * 1000)
  }

  # Handle milliseconds (e.g., "500ms", "0ms")
  ms_match <- grepl("\\d+(\\.\\d+)?ms", time_str)
  if (any(ms_match)) {
    ms <- as.numeric(gsub(".*?(\\d+(?:\\.\\d+)?)ms.*", "\\1", time_str[ms_match]))
    total_ms[ms_match] <- total_ms[ms_match] + ms
  }

  # Handle plain numbers (assume seconds)
  plain_number_match <- grepl("^\\d+(\\.\\d+)?$", time_str)
  if (any(plain_number_match)) {
    seconds <- as.numeric(time_str[plain_number_match])
    total_ms[plain_number_match] <- total_ms[plain_number_match] + (seconds * 1000)
  }

  return(total_ms)
}

# Vectorized function to convert milliseconds back to readable time string
ms_to_time <- function(ms) {
  # Vectorized version
  result <- character(length(ms))

  for (i in seq_along(ms)) {
    if (ms[i] < 1000) {
      result[i] <- paste0(ms[i], "ms")
    } else {
      # Convert to seconds, minutes, hours
      total_seconds <- ms[i] / 1000
      hours <- floor(total_seconds / 3600)
      minutes <- floor((total_seconds %% 3600) / 60)
      seconds <- total_seconds %% 60

      # Build time string
      time_parts <- c()

      if (hours > 0) {
        time_parts <- c(time_parts, paste0(hours, "h"))
      }

      if (minutes > 0) {
        time_parts <- c(time_parts, paste0(minutes, "m"))
      }

      if (seconds > 0) {
        if (seconds == floor(seconds)) {
          time_parts <- c(time_parts, paste0(seconds, "s"))
        } else {
          time_parts <- c(time_parts, paste0(round(seconds, 1), "s"))
        }
      }

      if (length(time_parts) == 0) {
        result[i] <- "0ms"
      } else {
        result[i] <- paste(time_parts, collapse = " ")
      }
    }
  }

  return(result)
}

In [46]:
# Kaiju for reads failed multiple times and skewed the results
result <- trace_file %>% filter(!grepl("FASTQ_KRAKEN_KAIJU", name)) %>%
  summarise(
    sum_cpu_time = ms_to_time(sum(time_to_ms(duration),na.rm = TRUE)),
    sum_real_time = ms_to_time(sum(time_to_ms(realtime),na.rm = TRUE)),
    peak_ram = formatIECBytes(max(convb(peak_rss), na.rm = TRUE)),
    peak_vmem = formatIECBytes(max(convb(peak_vmem),na.rm = TRUE))
  )
result

sum_cpu_time,sum_real_time,peak_ram,peak_vmem
<chr>,<chr>,<chr>,<chr>
411h 53m 21.5s,382h 56m 53.1s,79.20 GiB,85.30 GiB
