# Imports

In [None]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.patches as mpatches

# Constants

In [None]:
# Divisor applied to raw memory figures throughout this notebook
# (e.g. `value / SCALING_FACTOR`).
SCALING_FACTOR = 1024

# Distance cut-offs evaluated for each hierarchical linkage strategy.
_LINKAGE_CUTOFFS = (1, 5, 10, 25, 50, 90, 99)
_linkage_variants = [
    f"{linkage}_{cutoff}"
    for linkage in ("single-linkage", "complete-linkage")
    for cutoff in _LINKAGE_CUTOFFS
]

# Method families; each entry is "<tool>_<threshold>".
meshclust_methods = [f"meshclust_{threshold}" for threshold in ("0.95", "0.99")]
greedy_methods = [
    f"{tool}_{threshold}"
    for tool in ("gclust", "vsearch")
    for threshold in ("0.95", "0.99", "0.999")
]
clustering_methods = ["centroid", "ggrasp", *_linkage_variants]

# Every method in the order used throughout the notebook. "all" must stay
# first: later cells rely on METHODS[1:] to skip it.
METHODS = [
    "all",
    "centroid",
    "ggrasp",
    "vlq",
    *meshclust_methods,
    *greedy_methods,
    *_linkage_variants,
]

# Abundance levels (1, then 10..100 in steps of 10) and seeds (1..20).
ABUNDANCES = [1, *range(10, 101, 10)]
SEEDS = list(range(1, 21))
OUTPUT_DIR = "figures/viruses"

# Selection
**NOTE**: The runtime and memory usage of the selection stage of this study could not be retrieved. Instead, we provide below the runtimes and memory usage reported by SLURM's `seff` command.

## Global


In [None]:
# Selection-stage cost per method for the "global" experiment.
# Every record is {"runtime": seconds, "memory": peak memory}. The trailing
# `#<id> (<n> cores)` remarks are carried over from the original bookkeeping
# (presumably the SLURM job the figures were read from).

# MASH
mash_sketch = {"runtime": 35*60 + 1, "memory": 43.46/SCALING_FACTOR}  #10672665 (16 cores)
mash_dist = {"runtime": (1*60 + 25)*60 + 39, "memory": 43.68/SCALING_FACTOR}  #10672680 (16 cores)

# Methods that work on MASH distances are charged the sketch and dist steps
# on top of their own job: runtimes add up, memory is the overall peak.
_mash_runtime = mash_sketch["runtime"] + mash_dist["runtime"]
_mash_memory = max(mash_sketch["memory"], mash_dist["memory"])

# Clustering: a single job whose cost is attributed to every linkage variant.
clustering = {
    "runtime": _mash_runtime + (31*60 + 12),
    "memory": max(_mash_memory, 291.58/SCALING_FACTOR),
}  #10687086 (64 cores)

global_ = {
    # Centroid  #10673966 (16 cores)
    "centroid": {
        "runtime": _mash_runtime + (22),
        "memory": max(_mash_memory, 33.45/SCALING_FACTOR),
    },
    # GGRaSP  #10676627 (64 cores)
    "ggrasp": {
        "runtime": _mash_runtime + (((1*24 + 23)*60 + 43)*60 + 9),
        "memory": max(_mash_memory, 969.29/SCALING_FACTOR),
    },
    # VLQ  #10884144 (20 cores) - output is already in GB
    "vlq": {"runtime": ((4*60 + 29)*60 + 28), "memory": 10.44},
    # MeShClust - 0.95  #10674273 (64 cores)
    "meshclust_0.95": {"runtime": (((2*24 + 6)*60 + 38)*60 + 37), "memory": 240.72/SCALING_FACTOR},
    # MeShClust - 0.99  #10674568 (64 cores)
    "meshclust_0.99": {"runtime": (((2*24 + 10)*60 + 36)*60 + 51), "memory": 254.50/SCALING_FACTOR},
    # Gclust - 0.95  #10673980 (64 cores)
    "gclust_0.95": {"runtime": ((15*60 + 40)*60 + 36), "memory": 358.68/SCALING_FACTOR},
    # Gclust - 0.99  #10674104 (64 cores)
    "gclust_0.99": {"runtime": ((12*60 + 21)*60 + 13), "memory": 372.25/SCALING_FACTOR},
    # Gclust - 0.999  #10674236 (64 cores)
    "gclust_0.999": {"runtime": ((12*60 + 23)*60 + 14), "memory": 319.07/SCALING_FACTOR},
    # VSEARCH - 0.95  #10676635 (64 cores)
    "vsearch_0.95": {"runtime": (((8*24 + 11)*60 + 49)*60 + 34), "memory": 221.03/SCALING_FACTOR},
    # VSEARCH - 0.99  #10686153 (64 cores)
    "vsearch_0.99": {"runtime": (((12*24 + 13)*60 + 18)*60 + 19), "memory": 225.24/SCALING_FACTOR},
    # VSEARCH - 0.999  #10686343 (64 cores)
    "vsearch_0.999": {"runtime": (((15*24 + 11)*60 + 25)*60 + 54), "memory": 222.55/SCALING_FACTOR},
}

# SL / CL at every cut-off: each variant gets its own copy of the clustering
# record, because later cells add keys to these dicts per method.
for _linkage in ("single-linkage", "complete-linkage"):
    for _cutoff in (1, 5, 10, 25, 50, 90, 99):
        global_[f"{_linkage}_{_cutoff}"] = dict(clustering)  #10687086 (64 cores)

## Country

In [None]:
# Selection-stage cost per method for the "country" experiment.
# Every record is {"runtime": seconds, "memory": peak memory}. Runtimes are
# written as (hours*60 + minutes)*60 + seconds, with days folded in as
# days*24 + hours. Where a method needed several jobs, their runtimes are
# summed and the largest memory figure is kept. The trailing
# `#<id> (<n> cores)` remarks are carried over from the original bookkeeping
# (presumably the SLURM job the figures were read from).

# MASH
mash_sketch = {"runtime": 11*60 + 26, "memory": 3.05/SCALING_FACTOR}  #10888921 (16 cores)
mash_dist = {"runtime": 25*60 + 1, "memory": 44.93/SCALING_FACTOR}  #10888924 (16 cores)

# Methods that work on MASH distances are charged the sketch and dist steps
# on top of their own job: runtimes add up, memory is the overall peak.
_mash_runtime = mash_sketch["runtime"] + mash_dist["runtime"]
_mash_memory = max(mash_sketch["memory"], mash_dist["memory"])

country = {
    # Centroid  #10889079 (16 cores)
    "centroid": {
        "runtime": _mash_runtime + (10),
        "memory": max(_mash_memory, 28.24/SCALING_FACTOR),
    },
    # GGRaSP  #10889220 (64 cores)
    "ggrasp": {
        "runtime": _mash_runtime + ((11*60 + 39)*60 + 38),
        "memory": max(_mash_memory, 946.58/SCALING_FACTOR),
    },
    # VLQ  #10889250 (20 cores)
    "vlq": {"runtime": ((1*60 + 45)*60 + 53), "memory": 10.25},
    # MeShClust - 0.95  #10889212 (64 cores)
    "meshclust_0.95": {"runtime": ((11*60 + 38)*60 + 5), "memory": 226.20/SCALING_FACTOR},
    # MeShClust - 0.99  #10889215 (64 cores)
    "meshclust_0.99": {"runtime": ((15*60 + 37)*60 + 41), "memory": 228.77/SCALING_FACTOR},
    # Gclust - 0.95  #10889150+10889363 (64 cores)
    "gclust_0.95": {
        "runtime": ((2*60 + 12)*60 + 44) + ((4*60 + 1)*60 + 43),
        "memory": max(319.54/SCALING_FACTOR, 287.94/SCALING_FACTOR),
    },
    # Gclust - 0.99  #10889151+10889365 (64 cores)
    # The second term previously read `(2*60 + 15) + 51`, i.e. it was missing
    # the minutes -> seconds factor and counted a 2h15m51s job as 186 s.
    # NOTE(review): inferred from the pattern of the sibling entries; confirm
    # against the seff output of job 10889365.
    "gclust_0.99": {
        "runtime": ((2*60 + 13)*60 + 52) + ((2*60 + 15)*60 + 51),
        "memory": max(327.22/SCALING_FACTOR, 327.78/SCALING_FACTOR),
    },
    # Gclust - 0.999  #10889152+10889366 (64 cores)
    "gclust_0.999": {
        "runtime": ((2*60 + 11)*60 + 3) + ((2*60 + 16)*60 + 12),
        "memory": max(291.93/SCALING_FACTOR, 311.23/SCALING_FACTOR),
    },
    # VSEARCH - 0.95  #10889199+10890059 (64 cores)
    "vsearch_0.95": {
        "runtime": (((4*24 + 2)*60 + 42)*60 + 4) + (11*60 + 44),
        "memory": max(220.59/SCALING_FACTOR, 162.04/SCALING_FACTOR),
    },
    # VSEARCH - 0.99  #10889200+10890060+10890116 (64 cores)
    "vsearch_0.99": {
        "runtime": (((4*24 + 5)*60 + 41)*60 + 52) + (9) + (14),
        "memory": max(218.30/SCALING_FACTOR, 0/SCALING_FACTOR, 0/SCALING_FACTOR),
    },
    # VSEARCH - 0.999  #10889201+10890061+10890117 (64 cores)
    "vsearch_0.999": {
        "runtime": (((4*24 + 9)*60 + 21)*60 + 38) + (1*60 + 9) + (1),
        "memory": max(219.57/SCALING_FACTOR, 50.36/SCALING_FACTOR, 0/SCALING_FACTOR),
    },
}

# SL / CL at every cut-off share one clustering job (#10889114, 64 cores).
# Each variant gets its own dict because later cells add keys per method.
for _linkage in ("single-linkage", "complete-linkage"):
    for _cutoff in (1, 5, 10, 25, 50, 90, 99):
        country[f"{_linkage}_{_cutoff}"] = {
            "runtime": _mash_runtime + (9*60 + 7),
            "memory": max(_mash_memory, 249.36/SCALING_FACTOR),
        }

## State

In [None]:
# Selection-stage cost per method for the "state" experiment.
# Every record is {"runtime": seconds, "memory": peak memory}. Where a method
# needed several jobs, their runtimes are summed and the largest memory
# figure is kept. The trailing `#<id> (<n> cores)` remarks are carried over
# from the original bookkeeping (presumably the SLURM job the figures were
# read from).

# MASH
mash_sketch = {"runtime": 1*60 + 9, "memory": 0/SCALING_FACTOR}  #10888920 (16 cores)
mash_dist = {"runtime": 1*60 + 6, "memory": 0/SCALING_FACTOR}  #10888923 (16 cores)

# Methods that work on MASH distances are charged the sketch and dist steps
# on top of their own job: runtimes add up, memory is the overall peak.
_mash_runtime = mash_sketch["runtime"] + mash_dist["runtime"]
_mash_memory = max(mash_sketch["memory"], mash_dist["memory"])

state = {
    # Centroid  #10889078 (16 cores)
    "centroid": {
        "runtime": _mash_runtime + (2),
        "memory": max(_mash_memory, 0/SCALING_FACTOR),
    },
    # GGRaSP  #10889218 (64 cores)
    "ggrasp": {
        "runtime": _mash_runtime + (26*60 + 23),
        "memory": max(_mash_memory, 283.36/SCALING_FACTOR),
    },
    # VLQ  #10889249 (20 cores)
    "vlq": {"runtime": (33*60 + 7), "memory": 9.91},
    # MeShClust - 0.95  #10889211 (64 cores)
    "meshclust_0.95": {"runtime": ((1*60 + 19)*60 + 55), "memory": 171.94/SCALING_FACTOR},
    # MeShClust - 0.99  #10889210 (64 cores)
    "meshclust_0.99": {"runtime": ((1*60 + 18)*60 + 18), "memory": 138.39/SCALING_FACTOR},
    # Gclust - 0.95  #10889138+10889260 (64 cores)
    "gclust_0.95": {
        "runtime": (11*60 + 20) + (10*60 + 59),
        "memory": max(12.89/SCALING_FACTOR, 22.83/SCALING_FACTOR),
    },
    # Gclust - 0.99  #10889148+10889261 (64 cores)
    "gclust_0.99": {
        "runtime": (9*60 + 47) + (5*60 + 40),
        "memory": max(55.88/SCALING_FACTOR, 32.36/SCALING_FACTOR),
    },
    # Gclust - 0.999  #10889149+10889262 (64 cores)
    "gclust_0.999": {
        "runtime": (5*60 + 44) + (5*60 + 46),
        "memory": max(23.32/SCALING_FACTOR, 29.58/SCALING_FACTOR),
    },
    # VSEARCH - 0.95  #10889188+10890056 (64 cores)
    "vsearch_0.95": {
        "runtime": ((4*60 + 13)*60 + 3) + (0),
        "memory": max(196.14/SCALING_FACTOR, 0/SCALING_FACTOR),
    },
    # VSEARCH - 0.99  #10889197+10890057 (64 cores)
    "vsearch_0.99": {
        "runtime": ((4*60 + 12)*60 + 40) + (0),
        "memory": max(193.30/SCALING_FACTOR, 0/SCALING_FACTOR),
    },
    # VSEARCH - 0.999  #10889198+10890058 (64 cores)
    "vsearch_0.999": {
        "runtime": ((4*60 + 11)*60 + 41) + (0),
        "memory": max(198.60/SCALING_FACTOR, 0/SCALING_FACTOR),
    },
}

# SL / CL at every cut-off share one clustering job (#10889113, 64 cores).
# Each variant gets its own dict because later cells add keys per method.
for _linkage in ("single-linkage", "complete-linkage"):
    for _cutoff in (1, 5, 10, 25, 50, 90, 99):
        state[f"{_linkage}_{_cutoff}"] = {
            "runtime": _mash_runtime + (11),
            "memory": max(_mash_memory, 0/SCALING_FACTOR),
        }


# Indexing
**NOTE**: Here we assume that kallisto was run and timed with the `/usr/bin/time -v` command, and that the resulting logs were saved as `root/runtimes/indexing_${experiment}_${method}_${threshold}`.

In [None]:
# `/usr/bin/time -v` reports the maximum resident set size in kbytes;
# dividing by 1024**2 converts it to GB.
_KBYTES_PER_GB = 1024 ** 2


def _read_indexing_log(experiment, method):
    """Parse the indexing log of one experiment/method pair.

    Reads ``root/runtimes/indexing_{experiment}_{method}``, which is expected
    to hold the output of one or more ``/usr/bin/time -v`` invocations, and
    returns the figures of the first invocation that exited with status 0.

    Args:
        experiment: Experiment name used in the file name ("global",
            "country" or "state").
        method: Method name used in the file name (e.g. "vsearch_0.99").

    Returns:
        A dict with keys ``"user time"`` and ``"system time"`` (seconds) and
        ``"memory"`` (maximum resident set size in GB), or ``None`` when the
        log contains no invocation with exit status 0.

    Raises:
        OSError: If the log file cannot be opened.
        ValueError: If a matched line does not end in a number.
    """
    with open(f"root/runtimes/indexing_{experiment}_{method}", "r") as f_in:
        data = {}
        for line in f_in:
            if "Command being timed" in line:
                # A new invocation starts: discard whatever was collected for
                # a previous (failed) one.
                data = {"user time": 0, "system time": 0, "memory": 0}
            elif "User time (seconds)" in line:
                data["user time"] = float(line.strip().split()[-1])
            elif "System time (seconds)" in line:
                data["system time"] = float(line.strip().split()[-1])
            elif "Maximum resident set size" in line:
                data["memory"] = int(line.strip().split()[-1]) / _KBYTES_PER_GB
            elif "Exit status" in line:
                if int(line.strip().split()[-1]) == 0:
                    return data
    # No successful invocation was found.
    return None


def read_logs_global(method):
    """Return the indexing cost of `method` in the global experiment.

    See `_read_indexing_log` for the returned structure; ``None`` is returned
    when the log holds no successful run.
    """
    return _read_indexing_log("global", method)


def read_logs_country(method):
    """Return the indexing cost of `method` in the country experiment.

    See `_read_indexing_log` for the returned structure; ``None`` is returned
    when the log holds no successful run.
    """
    return _read_indexing_log("country", method)


def read_logs_state(method):
    """Return the indexing cost of `method` in the state experiment.

    See `_read_indexing_log` for the returned structure; ``None`` is returned
    when the log holds no successful run.
    """
    return _read_indexing_log("state", method)

# Each experiment's per-method record dict paired with the reader for its
# indexing logs.
_INDEXING_SOURCES = (
    (global_, read_logs_global),
    (country, read_logs_country),
    (state, read_logs_state),
)

# Add the indexing cost to the selection cost of every method.
# METHODS[1:] skips "all", which has no selection-stage record.
for method in METHODS[1:]:
    for records, read_logs in _INDEXING_SOURCES:
        cur = read_logs(method)
        # "time" = selection runtime + indexing CPU time (user + system);
        # "memory" = peak over both stages.
        records[method]["time"] = records[method]["runtime"] + cur["user time"] + cur["system time"]
        records[method]["memory"] = max(records[method]["memory"], cur["memory"])

# "all" has no selection stage, so its record holds the indexing cost only.
# The temporary is deliberately not called `all`: rebinding that name at
# module level would shadow the builtin for every later cell.
for records, read_logs in _INDEXING_SOURCES:
    all_indexing = read_logs("all")
    records["all"] = {
        "time": all_indexing["user time"] + all_indexing["system time"],
        "memory": all_indexing["memory"],
    }


# Profiling

In [None]:
# Directory that holds the estimation results of every experiment.
_ESTIMATION_ROOT = "/tudelft.net/staff-umbrella/refsetopt/manuscript/SARS-CoV-2/tmp_kallisto_indices/estimation_results_15-08-2025"


def _successful_runs(log_path, estimations_prefix, seed_separator=None):
    """Yield ``(seed, cpu_seconds, peak_memory)`` for each successful run in a log.

    The log is expected to hold the output of consecutive
    ``/usr/bin/time -v`` invocations. The seed is the first token that
    follows `estimations_prefix` on the "Command being timed" line, cut at
    `seed_separator` (``None`` = any whitespace). CPU time is user + system
    time in seconds; memory is the maximum resident set size in GB. Runs with
    a non-zero exit status are skipped.
    """
    seed = None
    cpu_seconds = 0
    peak_memory = 0
    with open(log_path, "r") as f_in:
        for line in f_in:
            if "Command being timed" in line:
                seed = int(line.split(estimations_prefix)[1].split(seed_separator)[0])
            elif "User time (seconds)" in line or "System time (seconds)" in line:
                cpu_seconds += float(line.strip().split()[-1])
            elif "Maximum resident set size (kbytes)" in line:
                # kbytes -> GB
                peak_memory = max(peak_memory, int(line.strip().split()[-1]) / (1024 ** 2))
            elif "Exit status" in line:
                if int(line.strip().split()[-1]) == 0:
                    if seed is None:
                        raise ValueError(
                            f"{log_path}: 'Exit status' found before any 'Command being timed' line"
                        )
                    yield seed, cpu_seconds, peak_memory
                # The exit status is the end of a run: start counting afresh.
                cpu_seconds = 0
                peak_memory = 0


def read_runtimes(method, experiment, vlq=False, *, root=_ESTIMATION_ROOT, abundances=None, seeds=None):
    """Collect profiling CPU time and peak memory per abundance and seed.

    For every abundance the kallisto log
    ``{root}/{experiment}/{method}/runtimes/{abundance}/kallisto`` is parsed.
    With ``vlq=True`` the sibling ``vlq`` log is parsed as well: its CPU time
    is added to the kallisto time of the same seed, and the larger of the two
    memory peaks is kept.

    Args:
        method: Method name (directory below the experiment).
        experiment: Experiment name (directory below `root`).
        vlq: Whether to include the VLQ step.
        root: Directory holding the estimation results. Defaults to the
            location used for the manuscript.
        abundances: Abundance levels to read. Defaults to the module-level
            ``ABUNDANCES``.
        seeds: Seeds that were run; only their count is used, to size the
            result. Seeds found in the logs are assumed to be numbered
            1..len(seeds). Defaults to the module-level ``SEEDS``.

    Returns:
        ``(times, memories)``: two float64 arrays of shape
        ``(len(abundances), len(seeds))`` holding CPU seconds and peak memory
        in GB. Entries of runs that failed or are absent stay 0.

    Raises:
        OSError: If a log file cannot be opened.
        ValueError: If a log is malformed (non-numeric field, or an exit
            status that is not preceded by a command line).
        IndexError: If a command line does not contain the estimations path,
            or a seed falls outside 1..len(seeds).
    """
    if abundances is None:
        abundances = ABUNDANCES
    if seeds is None:
        seeds = SEEDS

    basepath = f"{root}/{experiment}/{method}/runtimes"
    memories = np.zeros((len(abundances), len(seeds)), dtype=np.float64)
    times = np.zeros((len(abundances), len(seeds)), dtype=np.float64)

    for abundance_idx, abundance in enumerate(abundances):
        # The seed is encoded in the output path passed to the timed command.
        estimations_prefix = f"{root}/{experiment}/{method}/estimations/{abundance}/"

        kallisto_path = f"{basepath}/{abundance}/kallisto"
        for seed, cpu_seconds, peak_memory in _successful_runs(kallisto_path, estimations_prefix):
            times[abundance_idx, seed - 1] = cpu_seconds
            memories[abundance_idx, seed - 1] = peak_memory

        if vlq:
            # In the VLQ log the seed is followed by "_" rather than a space.
            vlq_path = f"{basepath}/{abundance}/vlq"
            for seed, cpu_seconds, peak_memory in _successful_runs(vlq_path, estimations_prefix, "_"):
                times[abundance_idx, seed - 1] += cpu_seconds
                memories[abundance_idx, seed - 1] = max(memories[abundance_idx, seed - 1], peak_memory)
    return times, memories

# Profiling removed from total times and memory
# For every method and experiment, store the mean profiling CPU time over all
# abundance/seed combinations under "profiling time" and the largest memory
# peak under "profiling memory". The commented-out lines are the earlier
# variant that folded these figures into the "time" / "memory" totals.
# NOTE(review): `baseline` and `continent` are not defined in the part of the
# notebook visible here (only `global_`, `country` and `state` are) - confirm
# they are created elsewhere, otherwise this loop raises NameError.
for method in METHODS:
    times, memory = read_runtimes(method, "baseline", vlq=False)
    #baseline[method]["time"] += np.mean(times)
    baseline[method]["profiling time"] = np.mean(times)
    #baseline[method]["memory"] = max(baseline[method]["memory"], np.max(memory))
    baseline[method]["profiling memory"] = np.max(memory)
    times, memory = read_runtimes(method, "continent", vlq=False)
    #continent[method]["time"] += np.mean(times)
    continent[method]["profiling time"] = np.mean(times)
    #continent[method]["memory"] = max(continent[method]["memory"], np.max(memory))
    continent[method]["profiling memory"] = np.max(memory)
    times, memory = read_runtimes(method, "country", vlq=False)
    #country[method]["time"] += np.mean(times)
    country[method]["profiling time"] = np.mean(times)
    #country[method]["memory"] = max(country[method]["memory"], np.max(memory))
    country[method]["profiling memory"] = np.max(memory)
    times, memory = read_runtimes(method, "state", vlq=False)
    #state[method]["time"] += np.mean(times)
    state[method]["profiling time"] = np.mean(times)
    #state[method]["memory"] = max(state[method]["memory"], np.max(memory))
    state[method]["profiling memory"] = np.max(memory)