# Imports

In [None]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.patches as mpatches

# Constants

In [None]:
# Base directory for generated figures; plots are saved under
# f"{OUTPUT_DIR}/{experiment}/" by the plotting cell further down.
OUTPUT_DIR = "figures/bacteria"

# Pre-processing
This fetches all of the species included in each experiment, since selection was in principle run for all bacterial species.

In [None]:
# Species taxids that take part in each experiment level.
genus_taxids, family_taxids, order_taxids = set(), set(), set()

# Each experiment directory fills exactly one of the sets above.
experiment_to_taxids = {
    "genus_experiments": genus_taxids,
    "family_experiments": family_taxids,
    "order_experiments": order_taxids,
}

for experiment, taxids in experiment_to_taxids.items():
    with open(f"root/reference_sets/{experiment}/all.tsv", "r") as tsv:
        for record in tsv:
            # Every row must hold exactly four tab-separated fields; only the
            # second one (parsed as an integer species identifier) is kept.
            _kingdom, species, _, _ = record.strip().split("\t")
            taxids.add(int(species))

# MASH
**NOTE**: here we assume that mash was run for every species individually, and timed using the `/usr/bin/time -v` command, saving runtimes in `root/runtimes/mash_${species}`!

In [None]:
def read_runtimes_mash(genus_taxids, family_taxids, order_taxids):
    """Aggregate CPU time and peak memory over the per-species Mash logs.

    Every file named ``mash_<species>`` in ``root/runtimes`` is scanned for
    the "User time", "System time" and "Maximum resident set size" lines.

    Args:
        genus_taxids: species ids counted towards the "genus" totals.
        family_taxids: species ids counted towards the "family" totals.
        order_taxids: species ids counted towards the "order" totals.

    Returns:
        ``(times, memory)``: two dicts keyed by "total", "genus", "family" and
        "order". ``times`` holds summed user+system seconds, ``memory`` the
        maximum resident set size in GB.
    """
    levels = ("total", "genus", "family", "order")
    times = dict.fromkeys(levels, 0)
    memory = dict.fromkeys(levels, 0)

    prefix = "mash_"
    species_ids = [
        int(name[len(prefix):])
        for name in os.listdir("root/runtimes")
        if name.startswith(prefix)
    ]

    for species in species_ids:
        elapsed = 0
        peak = 0
        with open(f"root/runtimes/mash_{species}", "r") as log:
            for row in log:
                if "User time (seconds)" in row or "System time (seconds)" in row:
                    elapsed += float(row.strip().split()[-1])
                elif "Maximum resident set size (kbytes)" in row:
                    # Convert kbytes to GB
                    peak = max(peak, float(row.strip().split()[-1]) / (1024**2))

        # "total" always counts; the other levels only for species in that experiment.
        counted = (
            ("total", True),
            ("genus", species in genus_taxids),
            ("family", species in family_taxids),
            ("order", species in order_taxids),
        )
        for level, included in counted:
            if included:
                times[level] += elapsed
                memory[level] = max(memory[level], peak)

    return times, memory


# Selection tools
**NOTE**: here we assume that selection was run for every species individually, and timed using the `/usr/bin/time -v -a` command, saving runtimes collectively in `root/runtimes/${method}_${threshold}`!

In [None]:
def read_runtimes_centroid(genus_taxids, family_taxids, order_taxids):
    """Aggregate CPU time and peak memory of the centroid selection runs.

    Reads ``root/runtimes/centroid``, which holds the concatenated
    ``/usr/bin/time -v`` reports of all runs. The taxid of a run is taken from
    its "Command being timed" line (the text after the last ``taxid`` up to
    the next double quote). When a taxid occurs several times, the most
    recent successful run wins.

    Args:
        genus_taxids: taxids counted towards the "genus" totals.
        family_taxids: taxids counted towards the "family" totals.
        order_taxids: taxids counted towards the "order" totals.

    Returns:
        ``(times, memory, num_failed)``: two dicts keyed by "total", "genus",
        "family" and "order" (summed user+system seconds, peak memory in GB)
        and the number of runs with a non-zero exit status.
    """
    levels = ("total", "genus", "family", "order")
    times = {level: 0 for level in levels}
    memory = {level: 0 for level in levels}
    runtime_per_taxid = {}
    memory_per_taxid = {}
    num_failed = 0

    def last_field(text):
        # The value of a `time -v` report line is its last whitespace-separated token.
        return text.strip().split()[-1]

    with open("root/runtimes/centroid", "r") as log:
        record = {}
        for row in log:
            if "Command being timed" in row:
                record = {}
                tail = row.strip().split("taxid")[-1]
                cur_taxid = int(tail.split("\"")[0])
                record["taxid"] = cur_taxid
            elif "User time (seconds)" in row:
                record["user time"] = float(last_field(row))
            elif "System time (seconds)" in row:
                record["system time"] = float(last_field(row))
            elif "Maximum resident set size" in row:
                # reported in kbytes -> divide by 1024^2 to get GB
                record["memory"] = float(last_field(row)) / (1024**2)
            elif "Exit status" in row:
                record["exit status"] = int(last_field(row))
                if record["exit status"] != 0:
                    num_failed += 1
                else:
                    # use latest available information
                    runtime_per_taxid[cur_taxid] = record["user time"] + record["system time"]
                    memory_per_taxid[cur_taxid] = record["memory"]

    per_level_members = (
        ("genus", genus_taxids),
        ("family", family_taxids),
        ("order", order_taxids),
    )
    for taxid, runtime in runtime_per_taxid.items():
        peak = memory_per_taxid[taxid]
        times["total"] += runtime
        memory["total"] = max(memory["total"], peak)
        for level, members in per_level_members:
            if taxid in members:
                times[level] += runtime
                memory[level] = max(memory[level], peak)

    return times, memory, num_failed

def _read_linkage_runtimes(path, genus_taxids, family_taxids, order_taxids):
    """Parse one concatenated ``/usr/bin/time -v`` log of a hierarchical clustering method.

    The taxid of each run is the integer between the last ``taxid`` and the
    following ``--`` on the "Command being timed" line. If a taxid has several
    successful runs, the latest one is used.

    Returns:
        ``(times, memory, num_failed)``: dicts keyed by "total", "genus",
        "family" and "order" (summed user+system seconds, peak memory in GB)
        and the number of runs with a non-zero exit status.
    """
    levels = ("total", "genus", "family", "order")
    times = dict.fromkeys(levels, 0)
    memory = dict.fromkeys(levels, 0)
    runtime_per_taxid = {}
    memory_per_taxid = {}
    num_failed = 0

    with open(path, "r") as f_in:
        data = {}
        for line in f_in:
            if "Command being timed" in line:
                data = {}
                tail = line.strip().split("taxid")[-1]
                cur_taxid = int(tail.split("--")[0])
                data["taxid"] = cur_taxid
            elif "User time (seconds)" in line:
                data["user time"] = float(line.strip().split()[-1])
            elif "System time (seconds)" in line:
                data["system time"] = float(line.strip().split()[-1])
            elif "Maximum resident set size" in line:
                # note that this is in kbytes! -> divide by 1024^2 to get GB
                data["memory"] = float(line.strip().split()[-1]) / (1024**2)
            elif "Exit status" in line:
                data["exit status"] = int(line.strip().split()[-1])
                if data["exit status"] == 0:
                    # use latest available information
                    runtime_per_taxid[cur_taxid] = data["user time"] + data["system time"]
                    memory_per_taxid[cur_taxid] = data["memory"]
                else:
                    num_failed += 1

    members_per_level = (
        ("genus", genus_taxids),
        ("family", family_taxids),
        ("order", order_taxids),
    )
    for taxid, runtime in runtime_per_taxid.items():
        peak = memory_per_taxid[taxid]
        times["total"] += runtime
        memory["total"] = max(memory["total"], peak)
        for level, members in members_per_level:
            if taxid in members:
                times[level] += runtime
                memory[level] = max(memory[level], peak)

    return times, memory, num_failed


def read_runtimes_clustering(genus_taxids, family_taxids, order_taxids):
    """Aggregate runtimes and memory of single- and complete-linkage clustering.

    Single-linkage is read from ``root/runtimes/single-linkage`` and
    complete-linkage from ``root/runtimes/complete-linkage``. The threshold
    does not matter here since all reference sets were generated at once.

    Previously the complete-linkage half re-read the single-linkage log, so
    both methods reported identical numbers.

    Args:
        genus_taxids: taxids counted towards the "genus" totals.
        family_taxids: taxids counted towards the "family" totals.
        order_taxids: taxids counted towards the "order" totals.

    Returns:
        ``(times_SL, memory_SL, num_failed_SL, times_CL, memory_CL,
        num_failed_CL)``; the dicts are keyed by "total", "genus", "family"
        and "order", the failure counts are plain integers.

    Raises:
        FileNotFoundError: if either log file is missing.
    """
    times_SL, memory_SL, num_failed_SL = _read_linkage_runtimes(
        "root/runtimes/single-linkage", genus_taxids, family_taxids, order_taxids
    )
    times_CL, memory_CL, num_failed_CL = _read_linkage_runtimes(
        "root/runtimes/complete-linkage", genus_taxids, family_taxids, order_taxids
    )
    return times_SL, memory_SL, num_failed_SL, times_CL, memory_CL, num_failed_CL

def read_runtimes_ggrasp(genus_taxids, family_taxids, order_taxids):
    """Aggregate CPU time, peak memory and failures of the GGRaSP runs.

    Reads ``root/runtimes/GGRaSP``, the concatenated ``/usr/bin/time -v``
    reports of all runs. The taxid of a run is the integer that follows the
    species directory prefix on the "Command being timed" line, up to the
    next ``:``. If a taxid has several successful runs, the latest one is
    used.

    A taxid counts as failed only if none of its runs exited with status 0.
    The earlier bookkeeping added one per failed run and subtracted one per
    taxid that also passed, so a taxid that failed twice before succeeding
    was still reported as a failure.

    Args:
        genus_taxids: taxids counted towards the "genus" totals.
        family_taxids: taxids counted towards the "family" totals.
        order_taxids: taxids counted towards the "order" totals.

    Returns:
        ``(times, memory, num_failed)``: three dicts keyed by "total",
        "genus", "family" and "order" holding summed user+system seconds,
        peak memory in GB and the number of taxids without a successful run.
    """
    levels = ("total", "genus", "family", "order")
    times = dict.fromkeys(levels, 0)
    memory = dict.fromkeys(levels, 0)
    num_failed = dict.fromkeys(levels, 0)
    passed = set()
    failed = set()
    runtime_per_taxid = {}
    memory_per_taxid = {}

    with open("root/runtimes/GGRaSP", "r") as f_in:
        data = {}
        for line in f_in:
            if "Command being timed" in line:
                data = {}
                tail = line.strip().split("/tudelft.net/staff-umbrella/refsetbenchmark/species/2/")[-1]
                cur_taxid = int(tail.split(":")[0])
                data["taxid"] = cur_taxid
            elif "User time (seconds)" in line:
                data["user time"] = float(line.strip().split()[-1])
            elif "System time (seconds)" in line:
                data["system time"] = float(line.strip().split()[-1])
            elif "Maximum resident set size" in line:
                # Convert kbytes to GB
                data["memory"] = float(line.strip().split()[-1]) / (1024**2)
            elif "Exit status" in line:
                data["exit status"] = int(line.strip().split()[-1])
                if data["exit status"] == 0:
                    # use latest available information
                    runtime_per_taxid[cur_taxid] = data["user time"] + data["system time"]
                    memory_per_taxid[cur_taxid] = data["memory"]
                    passed.add(cur_taxid)
                else:
                    failed.add(cur_taxid)

    members_per_level = (
        ("genus", genus_taxids),
        ("family", family_taxids),
        ("order", order_taxids),
    )

    def levels_of(taxid):
        # "total" always applies; the others only if the taxid is in that experiment.
        return ["total"] + [level for level, members in members_per_level if taxid in members]

    for taxid in failed - passed:
        for level in levels_of(taxid):
            num_failed[level] += 1

    # Iterate in log order (dict insertion order) so float sums are reproducible.
    for taxid, runtime in runtime_per_taxid.items():
        for level in levels_of(taxid):
            times[level] += runtime
            memory[level] = max(memory[level], memory_per_taxid[taxid])

    return times, memory, num_failed

def read_runtimes_gclust(threshold, genus_taxids, family_taxids, order_taxids):
    """Aggregate CPU time, peak memory and failures of the Gclust runs.

    Reads ``root/runtimes/Gclust_<threshold>``, the concatenated
    ``/usr/bin/time -v`` reports of all runs at that threshold. The taxid of
    a run is the first path component after the last ``root/genomes`` on the
    "Command being timed" line. If a taxid has several successful runs, the
    latest one is used.

    A taxid counts as failed only if none of its runs exited with status 0.
    The earlier bookkeeping added one per failed run and subtracted one per
    taxid that also passed, so a taxid that failed twice before succeeding
    was still reported as a failure.

    Args:
        threshold: suffix of the runtime file name (e.g. ``95``).
        genus_taxids: taxids counted towards the "genus" totals.
        family_taxids: taxids counted towards the "family" totals.
        order_taxids: taxids counted towards the "order" totals.

    Returns:
        ``(times, memory, num_failed)``: three dicts keyed by "total",
        "genus", "family" and "order" holding summed user+system seconds,
        peak memory in GB and the number of taxids without a successful run.
    """
    levels = ("total", "genus", "family", "order")
    times = dict.fromkeys(levels, 0)
    memory = dict.fromkeys(levels, 0)
    num_failed = dict.fromkeys(levels, 0)
    passed = set()
    failed = set()
    runtime_per_taxid = {}
    memory_per_taxid = {}

    with open(f"root/runtimes/Gclust_{threshold}", "r") as f_in:
        data = {}
        for line in f_in:
            if "Command being timed" in line:
                data = {}
                tail = line.strip().split("root/genomes")[-1]
                # The split prefix has no trailing slash, so a path such as
                # root/genomes/<taxid>/... leaves a leading "/" that would make
                # the first component empty; drop it before taking the taxid.
                cur_taxid = int(tail.lstrip("/").split("/")[0])
                data["taxid"] = cur_taxid
            elif "User time (seconds)" in line:
                data["user time"] = float(line.strip().split()[-1])
            elif "System time (seconds)" in line:
                data["system time"] = float(line.strip().split()[-1])
            elif "Maximum resident set size" in line:
                # Convert kbytes to GB
                data["memory"] = float(line.strip().split()[-1]) / (1024**2)
            elif "Exit status" in line:
                data["exit status"] = int(line.strip().split()[-1])
                if data["exit status"] == 0:
                    # use latest available information
                    runtime_per_taxid[cur_taxid] = data["user time"] + data["system time"]
                    memory_per_taxid[cur_taxid] = data["memory"]
                    passed.add(cur_taxid)
                else:
                    failed.add(cur_taxid)

    members_per_level = (
        ("genus", genus_taxids),
        ("family", family_taxids),
        ("order", order_taxids),
    )

    def levels_of(taxid):
        # "total" always applies; the others only if the taxid is in that experiment.
        return ["total"] + [level for level, members in members_per_level if taxid in members]

    for taxid in failed - passed:
        for level in levels_of(taxid):
            num_failed[level] += 1

    # Iterate in log order (dict insertion order) so float sums are reproducible.
    for taxid, runtime in runtime_per_taxid.items():
        for level in levels_of(taxid):
            times[level] += runtime
            memory[level] = max(memory[level], memory_per_taxid[taxid])

    return times, memory, num_failed

def read_runtimes_meshclust(threshold, genus_taxids, family_taxids, order_taxids):
    """Aggregate CPU time, peak memory and failures of the MeShClust runs.

    Reads ``root/runtimes/MeShClust_<threshold>``, the concatenated
    ``/usr/bin/time -v`` reports of all runs at that threshold. The taxid of
    a run is the first path component after the last ``root/genomes`` on the
    "Command being timed" line. If a taxid has several successful runs, the
    latest one is used.

    A taxid counts as failed only if none of its runs exited with status 0.
    The earlier bookkeeping added one per failed run and subtracted one per
    taxid that also passed, so a taxid that failed twice before succeeding
    was still reported as a failure.

    Args:
        threshold: suffix of the runtime file name (e.g. ``95``).
        genus_taxids: taxids counted towards the "genus" totals.
        family_taxids: taxids counted towards the "family" totals.
        order_taxids: taxids counted towards the "order" totals.

    Returns:
        ``(times, memory, num_failed)``: three dicts keyed by "total",
        "genus", "family" and "order" holding summed user+system seconds,
        peak memory in GB and the number of taxids without a successful run.
    """
    levels = ("total", "genus", "family", "order")
    times = {level: 0 for level in levels}
    memory = {level: 0 for level in levels}
    num_failed = {level: 0 for level in levels}
    passed = set()
    failed = set()
    runtime_per_taxid = {}
    memory_per_taxid = {}

    with open(f"root/runtimes/MeShClust_{threshold}", "r") as f_in:
        data = {}
        for line in f_in:
            if "Command being timed" in line:
                data = {}
                tail = line.strip().split("root/genomes")[-1]
                # The split prefix has no trailing slash, so a path such as
                # root/genomes/<taxid>/... leaves a leading "/" that would make
                # the first component empty; drop it before taking the taxid.
                cur_taxid = int(tail.lstrip("/").split("/")[0])
                data["taxid"] = cur_taxid
            elif "User time (seconds)" in line:
                data["user time"] = float(line.strip().split()[-1])
            elif "System time (seconds)" in line:
                data["system time"] = float(line.strip().split()[-1])
            elif "Maximum resident set size" in line:
                # Convert kbytes to GB
                data["memory"] = float(line.strip().split()[-1]) / (1024**2)
            elif "Exit status" in line:
                data["exit status"] = int(line.strip().split()[-1])
                if data["exit status"] == 0:
                    # use latest available information
                    runtime_per_taxid[cur_taxid] = data["user time"] + data["system time"]
                    memory_per_taxid[cur_taxid] = data["memory"]
                    passed.add(cur_taxid)
                else:
                    failed.add(cur_taxid)

    members_per_level = (
        ("genus", genus_taxids),
        ("family", family_taxids),
        ("order", order_taxids),
    )

    def levels_of(taxid):
        # "total" always applies; the others only if the taxid is in that experiment.
        return ["total"] + [level for level, members in members_per_level if taxid in members]

    for taxid in failed - passed:
        for level in levels_of(taxid):
            num_failed[level] += 1

    # Iterate in log order (dict insertion order) so float sums are reproducible.
    for taxid, runtime in runtime_per_taxid.items():
        for level in levels_of(taxid):
            times[level] += runtime
            memory[level] = max(memory[level], memory_per_taxid[taxid])

    return times, memory, num_failed

# Index building
**NOTE**: here we assume that index building was timed using the `/usr/bin/time -v` command, saving runtimes collectively in `root/runtimes/indexing_${experiment}_${profiler}_${method}_${threshold}`!

In [None]:
def read_runtimes_kraken_indexing(experiment, method):
    """Sum CPU time and find peak memory of the Kraken index-building steps.

    Reads ``root/runtimes/indexing_<experiment>_kraken_<method>``. Each
    "Exit status" line closes one timed command; only commands that exited
    with status 0 contribute.

    Args:
        experiment: experiment name used in the runtime file name.
        method: selection method name used in the runtime file name.

    Returns:
        ``(time, memory)``: summed user+system seconds and the largest
        resident set size in GB over the successful commands.
    """
    total_time = 0
    peak_memory = 0
    # Accumulators for the command currently being read.
    step_time = 0
    step_memory = 0

    with open(f"root/runtimes/indexing_{experiment}_kraken_{method}", "r") as log:
        for row in log:
            if "User time (seconds)" in row or "System time (seconds)" in row:
                step_time += float(row.strip().split()[-1])
            elif "Maximum resident set size (kbytes)" in row:
                step_memory = int(row.strip().split()[-1]) / (1024**2)
            elif "Exit status" in row:
                succeeded = int(row.strip().split()[-1]) == 0
                if succeeded:
                    total_time += step_time
                    peak_memory = max(peak_memory, step_memory)
                step_time = 0
                step_memory = 0

    return total_time, peak_memory

def read_runtimes_centrifuge_indexing(experiment, method):
    """Sum CPU time and find peak memory of the Centrifuge index-building steps.

    Reads ``root/runtimes/indexing_<experiment>_centrifuge_<method>``. Each
    "Exit status" line closes one timed command; only commands that exited
    with status 0 contribute.

    Args:
        experiment: experiment name used in the runtime file name.
        method: selection method name used in the runtime file name.

    Returns:
        ``(time, memory)``: summed user+system seconds and the largest
        resident set size in GB over the successful commands.
    """

    def last_token(text):
        # The numeric value is the final whitespace-separated token of a report line.
        return text.strip().split()[-1]

    def fresh_step():
        return {"time": 0, "memory": 0}

    time = 0
    memory = 0
    step = fresh_step()
    with open(f"root/runtimes/indexing_{experiment}_centrifuge_{method}", "r") as log:
        for row in log:
            if "User time (seconds)" in row:
                step["time"] += float(last_token(row))
            elif "System time (seconds)" in row:
                step["time"] += float(last_token(row))
            elif "Maximum resident set size (kbytes)" in row:
                step["memory"] = int(last_token(row)) / (1024**2)
            elif "Exit status" in row:
                if int(last_token(row)) == 0:
                    time += step["time"]
                    memory = max(memory, step["memory"])
                # Start over for the next command regardless of the outcome.
                step = fresh_step()

    return time, memory

def read_runtimes_dudes_indexing(experiment, method):
    """Sum CPU time and find peak memory of the DUDes index-building steps.

    Reads ``root/runtimes/indexing_<experiment>_dudes_<method>``. Each
    "Exit status" line closes one timed command; only commands that exited
    with status 0 contribute.

    Args:
        experiment: experiment name used in the runtime file name.
        method: selection method name used in the runtime file name.

    Returns:
        ``(time, memory)``: summed user+system seconds and the largest
        resident set size in GB over the successful commands.
    """
    cpu_markers = ("User time (seconds)", "System time (seconds)")
    time, memory = 0, 0
    pending_time, pending_memory = 0, 0

    log_path = f"root/runtimes/indexing_{experiment}_dudes_{method}"
    with open(log_path, "r") as log:
        for entry in log:
            if any(marker in entry for marker in cpu_markers):
                pending_time += float(entry.strip().split()[-1])
                continue
            if "Maximum resident set size (kbytes)" in entry:
                pending_memory = int(entry.strip().split()[-1]) / (1024**2)
                continue
            if "Exit status" not in entry:
                continue
            # End of one timed command: keep it only if it succeeded.
            if int(entry.strip().split()[-1]) == 0:
                time += pending_time
                memory = max(memory, pending_memory)
            pending_time, pending_memory = 0, 0

    return time, memory

# Profiling
**NOTE**: here we assume that profiling was timed using the `/usr/bin/time -v -a` command, saving runtimes collectively (over all samples) in `root/runtimes/profiling_${experiment}_${profiler}_${method}_${threshold}`!

In [None]:
def read_runtimes_kraken_profiling(experiment, method):
    """Collect per-sample CPU time and peak memory of Kraken + Bracken profiling.

    Reads ``root/runtimes/profiling_<experiment>_kraken_<method>``. Time and
    memory accumulate over consecutive commands until the exit status of a
    command other than ``kraken2 --db`` (treated as Bracken) is reached, so
    one entry covers Kraken and Bracken together. The entry is kept only if
    that closing command exited with status 0.

    Args:
        experiment: experiment name used in the runtime file name.
        method: selection method name used in the runtime file name.

    Returns:
        ``(times, memory)``: list of user+system seconds per kept entry and
        the largest resident set size in GB over the kept entries.
    """
    times = []
    memory = 0
    elapsed = 0
    peak = 0
    # True while the most recent command seen is the Bracken step.
    in_bracken = False

    with open(f"root/runtimes/profiling_{experiment}_kraken_{method}", "r") as log:
        for row in log:
            if 'Command being timed:' in row:
                in_bracken = "kraken2 --db" not in row

            if "User time (seconds)" in row or "System time (seconds)" in row:
                elapsed += float(row.strip().split()[-1])
            elif "Maximum resident set size (kbytes)" in row:
                peak = max(peak, int(row.strip().split()[-1]) / (1024**2))
            elif in_bracken and "Exit status" in row:
                # store collective time and memory for kraken+bracken
                if int(row.strip().split()[-1]) == 0:
                    times.append(elapsed)
                    memory = max(memory, peak)
                elapsed = 0
                peak = 0
                in_bracken = False

    return times, memory

def read_runtimes_centrifuge_profiling(experiment, method):
    """Collect per-command CPU time and peak memory of Centrifuge profiling.

    Reads ``root/runtimes/profiling_<experiment>_centrifuge_<method>``. Each
    "Exit status" line closes one timed command; commands with a non-zero
    exit status are discarded.

    Args:
        experiment: experiment name used in the runtime file name.
        method: selection method name used in the runtime file name.

    Returns:
        ``(times, memory)``: list of user+system seconds per successful
        command and the largest resident set size in GB among them.
    """
    times = []
    memory = 0
    run_time = 0
    run_memory = 0

    log_path = f"root/runtimes/profiling_{experiment}_centrifuge_{method}"
    with open(log_path, "r") as log:
        for row in log:
            if "User time (seconds)" in row or "System time (seconds)" in row:
                run_time += float(row.strip().split()[-1])
            elif "Maximum resident set size (kbytes)" in row:
                run_memory = int(row.strip().split()[-1]) / (1024**2)
            elif "Exit status" in row:
                exit_code = int(row.strip().split()[-1])
                if exit_code == 0:
                    times.append(run_time)
                    memory = max(memory, run_memory)
                # Reset the accumulators for the next command.
                run_time = 0
                run_memory = 0

    return times, memory

def read_runtimes_dudes_profiling(experiment, method):
    """Collect per-sample CPU time and peak memory of BWA + DUDes profiling.

    Reads ``root/runtimes/profiling_<experiment>_dudes_<method>``. Time and
    memory accumulate over consecutive commands until the exit status that
    follows a ``dudes -s ... -l species`` command, so one entry covers the
    preceding commands (BWA) plus DUDes. The entry is kept only if that exit
    status is 0.

    Args:
        experiment: experiment name used in the runtime file name.
        method: selection method name used in the runtime file name.

    Returns:
        ``(times, memory)``: list of user+system seconds per kept entry and
        the largest resident set size in GB over the kept entries.
    """
    times = []
    memory = 0
    totals = {"time": 0, "memory": 0}
    # Set once the DUDes command has been seen; cleared after its exit status.
    dudes_seen = False

    with open(f"root/runtimes/profiling_{experiment}_dudes_{method}", "r") as log:
        for row in log:
            is_dudes_command = (
                "Command being timed:" in row
                and "dudes -s" in row
                and "-l species" in row
            )
            if is_dudes_command:
                # after the dudes command, we have total profiling time and memory (BWA + DUDes)
                dudes_seen = True

            if "User time (seconds)" in row or "System time (seconds)" in row:
                totals["time"] += float(row.strip().split()[-1])
            elif "Maximum resident set size (kbytes)" in row:
                totals["memory"] = max(totals["memory"], int(row.strip().split()[-1]) / (1024**2))
            elif "Exit status" in row and dudes_seen:
                if int(row.strip().split()[-1]) == 0:
                    times.append(totals["time"])
                    memory = max(memory, totals["memory"])
                totals = {"time": 0, "memory": 0}
                dudes_seen = False

    return times, memory

# Fetch runtimes and memory

In [None]:
# First determine proportions of runtimes (i.e. how much of total time is spent on steps)

# Selection methods (without threshold suffix) that require Mash to be run first;
# the Mash runtime and memory are counted as part of their selection step.
mash_steps = [
    "centroid",
    "ggrasp",
    "single-linkage",
    "complete-linkage"
]

METHODS = ["all", "centroid", "ggrasp", "meshclust_0.95", "meshclust_0.97", "meshclust_0.99", "gclust_0.95", "gclust_0.97", "gclust_0.99", "single-linkage_0.95", "single-linkage_0.97", "single-linkage_0.99", "complete-linkage_0.95", "complete-linkage_0.97", "complete-linkage_0.99"]
METHOD_LABELS = {
    "all": "All",
    "centroid": "Centroid",
    "ggrasp": "GGRaSP",
    "meshclust_0.95": "MC-0.95",
    "meshclust_0.97": "MC-0.97",
    "meshclust_0.99": "MC-0.99",
    "gclust_0.95": "GC-0.95",
    "gclust_0.97": "GC-0.97",
    "gclust_0.99": "GC-0.99",
    "single-linkage_0.95": "SL-0.95",
    "single-linkage_0.97": "SL-0.97",
    "single-linkage_0.99": "SL-0.99",
    "complete-linkage_0.95": "CL-0.95",
    "complete-linkage_0.97": "CL-0.97",
    "complete-linkage_0.99": "CL-0.99"
}
EXPERIMENTS = ["genus_experiments", "family_experiments", "order_experiments"]

# Per-profiler log readers, so the loops below need no if/elif chains.
indexing_readers = {
    "kraken": read_runtimes_kraken_indexing,
    "centrifuge": read_runtimes_centrifuge_indexing,
    "dudes": read_runtimes_dudes_indexing,
}
profiling_readers = {
    "kraken": read_runtimes_kraken_profiling,
    "centrifuge": read_runtimes_centrifuge_profiling,
    "dudes": read_runtimes_dudes_profiling,
}

# Selection runtimes do not depend on the profiler, so the logs are parsed once
# instead of once per profiler.
times_selection = {}
memory_selection = {}
num_failed = {}
times_selection["mash"], memory_selection["mash"] = read_runtimes_mash(genus_taxids, family_taxids, order_taxids)
times_selection["centroid"], memory_selection["centroid"], num_failed["centroid"] = read_runtimes_centroid(genus_taxids, family_taxids, order_taxids)
(
    times_selection["single-linkage"], memory_selection["single-linkage"], num_failed["single-linkage"],
    times_selection["complete-linkage"], memory_selection["complete-linkage"], num_failed["complete-linkage"],
) = read_runtimes_clustering(genus_taxids, family_taxids, order_taxids)
times_selection["ggrasp"], memory_selection["ggrasp"], num_failed["ggrasp"] = read_runtimes_ggrasp(genus_taxids, family_taxids, order_taxids)
for pct in (95, 97, 99):
    key = f"gclust_0.{pct}"
    times_selection[key], memory_selection[key], num_failed[key] = read_runtimes_gclust(pct, genus_taxids, family_taxids, order_taxids)
for pct in (95, 97, 99):
    key = f"meshclust_0.{pct}"
    times_selection[key], memory_selection[key], num_failed[key] = read_runtimes_meshclust(pct, genus_taxids, family_taxids, order_taxids)

compared_to_all = []
for profiler in ["kraken", "centrifuge", "dudes"]:
    read_indexing = indexing_readers[profiler]
    read_profiling = profiling_readers[profiler]

    times_indexing = {}
    memory_indexing = {}
    # Calculate runtimes for indexing
    for experiment in tqdm(EXPERIMENTS):
        times_indexing[experiment] = {}
        memory_indexing[experiment] = {}
        for method in METHODS:
            times_indexing[experiment][method], memory_indexing[experiment][method] = read_indexing(experiment, method)

    times_profiling = {}
    memory_profiling = {}
    # Calculate runtimes for profiling (averaged over the successfully profiled samples)
    for experiment in tqdm(EXPERIMENTS):
        times_profiling[experiment] = {}
        memory_profiling[experiment] = {}
        for method in METHODS:
            sample_times, memory_profiling[experiment][method] = read_profiling(experiment, method)
            times_profiling[experiment][method] = np.mean(sample_times)

    for experiment in EXPERIMENTS:
        level = experiment.split("_")[0]
        print(profiler, experiment)
        for method in METHODS:
            # Method names look like "<base>" or "<base>_<threshold>". The base
            # itself may contain "-" (e.g. "single-linkage"), so the threshold
            # must be split off at the underscore, not at the hyphen.
            base_method, _, threshold = method.partition("_")

            # selection
            if base_method in mash_steps:
                total_selection = times_selection["mash"][level]
                total_selection_mem = memory_selection["mash"][level]
            else:
                total_selection = 0
                total_selection_mem = 0
            # Runtimes are stored per threshold (gclust, meshclust) or once per
            # base method (hierarchical clustering); "all" has no selection step.
            selection_key = next(
                (key for key in (method, base_method) if key in times_selection),
                None,
            )
            if selection_key is not None:
                total_selection += times_selection[selection_key][level]
                total_selection_mem = max(total_selection_mem, memory_selection[selection_key][level])

            # indexing
            total_indexing = times_indexing[experiment][method]
            total_indexing_mem = memory_indexing[experiment][method]

            # profiling (average)
            total_profiling = times_profiling[experiment][method]
            total_profiling_mem = memory_profiling[experiment][method]

            #total_time = total_selection + total_indexing + total_profiling #this includes profiling
            total_time = total_selection + total_indexing  #this excludes profiling
            # NOTE(review): "relative_to_all" divides a total that excludes
            # profiling by a baseline that includes profiling for "all", and the
            # printed profiling share is relative to the same profiling-free
            # total. Kept as is; confirm this is the intended comparison.
            compared_to_all.append({
                "profiler": profiler,
                "experiment": experiment,
                "method": method,
                "threshold": threshold if threshold else "-",
                "total_time": total_time,
                "profiling_time": total_profiling,
                #"memory": max(total_selection_mem, total_indexing_mem, total_profiling_mem),
                "memory": max(total_selection_mem, total_indexing_mem),
                "profiling_memory": total_profiling_mem,
                "relative_to_all": total_time / (times_profiling[experiment]["all"] + times_indexing[experiment]["all"]),
                "mem_relative_to_all": max(total_selection_mem, total_indexing_mem, total_profiling_mem) / max(memory_indexing[experiment]["all"], memory_profiling[experiment]["all"]),
            })
            print(f"Method={method}, selection={total_selection/total_time*100:.2f}%, indexing={total_indexing/total_time*100:.2f}%, profiling={total_profiling/total_time*100:.2f}%")

# Actual plotting

In [None]:
clustering_methods = [
    "centroid",
    "ggrasp",
    "single-linkage_0.95",
    "single-linkage_0.97",
    "single-linkage_0.99",
    "complete-linkage_0.95",
    "complete-linkage_0.97",
    "complete-linkage_0.99",
]
meshclust_methods = [
    "meshclust_0.95",
    "meshclust_0.97",
    "meshclust_0.99",
]
gclust_methods = [
    "gclust_0.95",
    "gclust_0.97",
    "gclust_0.99",
]
method_to_group = {}
for method in clustering_methods:
    method_to_group[method] = "Hierarchical\nclustering"
for method in meshclust_methods:
    method_to_group[method] = "MeShClust"
for method in gclust_methods:
    method_to_group[method] = "Gclust"
method_to_group["all"] = "All"

# Build the plotting frame from `compared_to_all` and tag every row with the
# group its method belongs to (methods missing from `method_to_group` map to NaN).
df = pd.DataFrame(compared_to_all)
df_copy = df.copy()
if "method_group" not in df_copy.columns:
    df_copy["method_group"] = df_copy["method"].map(method_to_group)
# Use a human-readable column name, since it is passed as the x variable when plotting.
df_copy = df_copy.rename(columns={"method_group": "Method group"})
# NOTE: the "relative_to_all" / "mem_relative_to_all" columns are not rescaled
# to percentages here.
print(df_copy.columns)

group_order = ["Hierarchical\nclustering", "MeShClust", "Gclust", "All"]
exp_order = ["genus_experiments", "family_experiments", "order_experiments"]


def _plot_profiling_metric(experiment_df, all_df, experiment, column, filename):
    """Draw and save one box + strip plot of `column` per method group.

    Parameters
    ----------
    experiment_df : pandas.DataFrame
        Rows of the experiment being plotted; must contain the columns
        "Method group", "profiler" and `column`.
    all_df : pandas.DataFrame
        Rows of all experiments. Only used to derive the y-limits, so that
        the figures of the different experiments share the same y-range.
    experiment : str
        Experiment name; used as the figure title and as the output
        sub-directory below `OUTPUT_DIR`.
    column : str
        Name of the column plotted on the (log-scaled) y-axis.
    filename : str
        File name of the saved figure inside the output sub-directory.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was created and saved.
    """
    fig = plt.figure(figsize=(8, 6))

    # Grey boxes summarise the distribution per method group; outlier markers
    # are hidden because every data point is drawn by the strip plot below.
    ax = sns.boxplot(
        data=experiment_df,
        x="Method group",
        y=column,
        order=group_order,
        dodge=True,
        width=0.8,
        gap=0.05,
        patch_artist=True,
        boxprops=dict(facecolor='silver', edgecolor="black", alpha=0.6),
        medianprops=dict(color='black'),
        whiskerprops=dict(color='black'),
        capprops=dict(color='black'),
        flierprops=dict(marker=''),
    )
    # Individual measurements, coloured by profiler, on top of the boxes.
    sns.stripplot(
        data=experiment_df,
        x="Method group",
        y=column,
        hue="profiler",
        order=group_order,
        size=10,
        alpha=0.75,
        ax=ax,
        dodge=False,
        edgecolor="black",
        linewidth=1,
        jitter=0.2,
    )

    plt.title(experiment, fontsize=20)
    plt.yticks(fontsize=13)
    plt.ylabel("", fontsize=16)
    # Lower bound: at most 1, or 80% of the global minimum when that is smaller.
    # Upper bound: the global maximum rounded up to the next power of ten.
    plt.ylim(
        bottom=min(all_df[column].min() * 0.8, 1),
        top=10 ** np.ceil(np.log10(all_df[column].max())),
    )
    plt.xticks(fontsize=15)
    plt.xlabel("")
    ax.set_yscale("log", base=10)
    plt.tight_layout()
    # Faint lines for all (major + minor) ticks, stronger lines for major ticks.
    ax.grid(axis="y", which="both", linestyle="--", linewidth=0.5, alpha=0.3)
    ax.grid(axis="y", which="major", linestyle="--", linewidth=1.5, alpha=0.8)

    # savefig does not create missing directories, so make sure the
    # per-experiment output directory exists before writing.
    output_dir = f"{OUTPUT_DIR}/{experiment}"
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f"{output_dir}/{filename}", dpi=500, bbox_inches="tight")
    return fig


for experiment in exp_order:
    cur_df = df_copy[df_copy["experiment"] == experiment]

    # This creates the plots in Figure 7
    fig = _plot_profiling_metric(cur_df, df_copy, experiment, "profiling_time", "runtimes_profiling.svg")

    # This creates the plots in Supplementary Figure S8
    fig = _plot_profiling_metric(cur_df, df_copy, experiment, "profiling_memory", "memory_profiling.svg")
