# Getting started
**NOTE**: we assume that the groundtruths are known, since we base our results on simulations. Specifically, we assume that the user has access to a pair of files for every sample, called `sample_sampleid_groundtruth_reads.tsv` and `sample_sampleid_groundtruth_folds.tsv`, located in the corresponding sample folder (`samples/sample_sampleid/`). These files are provided on [Zenodo](https://doi.org/10.5281/zenodo.14727633) for the experiments in our manuscript; every line holds a species taxid followed by the relative abundance (either read-based or fold-based), delimited by a tab. For the read-based groundtruths, a third column is added with the absolute number of reads of every species.

In [None]:
def read_groundtruth_reads(sample_number):
    """Load the read-based groundtruth of one simulated sample.

    Reads ``samples/sample_<n>/sample_<n>_groundtruth_reads.tsv``, a
    tab-delimited file whose columns are: species taxid, relative abundance,
    absolute number of reads.

    Args:
        sample_number: identifier substituted into the sample path.

    Returns:
        A pair ``(abundances, read_totals)`` of dicts keyed by integer taxid,
        holding the relative abundance (float) and the read count (int).

    Raises:
        FileNotFoundError: if the groundtruth file does not exist.
        ValueError, IndexError: if a non-empty line is malformed.
    """
    abundances = {}
    read_totals = {}
    path = f"samples/sample_{sample_number}/sample_{sample_number}_groundtruth_reads.tsv"
    with open(path, "r") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no record; skipping them
                # avoids a ValueError from int("").
                continue
            fields = line.split("\t")
            taxid = int(fields[0])
            abundances[taxid] = float(fields[1])
            read_totals[taxid] = int(fields[2])
    return abundances, read_totals

def read_groundtruth_folds(sample_number):
    """Load the fold-based groundtruth of one simulated sample.

    Reads ``samples/sample_<n>/sample_<n>_groundtruth_folds.tsv``, a
    tab-delimited file whose first two columns are: species taxid and
    fold-based relative abundance.

    Args:
        sample_number: identifier substituted into the sample path.

    Returns:
        A dict mapping integer taxid to its relative abundance (float).

    Raises:
        FileNotFoundError: if the groundtruth file does not exist.
        ValueError, IndexError: if a non-empty line is malformed.
    """
    abundances = {}
    path = f"samples/sample_{sample_number}/sample_{sample_number}_groundtruth_folds.tsv"
    with open(path, "r") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no record; skipping them
                # avoids a ValueError from int("").
                continue
            fields = line.split("\t")
            abundances[int(fields[0])] = float(fields[1])
    return abundances

# Gather groundtruth for example sample_1
# groundtruth_reads:  taxid -> read-based relative abundance
# groundtruth_totals: taxid -> absolute number of reads
groundtruth_reads, groundtruth_totals = read_groundtruth_reads(1)
# groundtruth_folds:  taxid -> fold-based relative abundance
groundtruth_folds = read_groundtruth_folds(1)

## Kraken2 + Bracken

In [None]:
def process_bracken(method_threshold, sample_number, groundtruth_reads, groundtruth_totals=None):
    """Score a Kraken2 + Bracken estimate against the read-based groundtruth.

    Parses ``estimations/sample_<n>/kraken2-bracken/<method_threshold>.bracken``
    (tab-delimited, one header line; the second column is read as the taxid
    and the sixth column as the number of reads assigned to it), drops every
    taxon below 0.1% of the assigned reads, renormalises the remainder and
    compares it with the groundtruth.

    Args:
        method_threshold: stem of the ``.bracken`` file to evaluate.
        sample_number: identifier substituted into the estimation path.
        groundtruth_reads: dict taxid -> read-based relative abundance.
        groundtruth_totals: optional dict taxid -> absolute read count. When
            given, ``"unclassified"`` is the number of groundtruth reads not
            assigned by Bracken. When omitted, the historical behaviour is
            kept: the sum of ``groundtruth_reads`` minus the assigned reads,
            which mixes relative abundances with read counts.

    Returns:
        A dict with keys ``"l1"`` (sum of absolute abundance errors over all
        taxa in the groundtruth or the prediction), ``"f1"`` (presence/absence
        F1 score after filtering) and ``"unclassified"``.
        If no reads survive the filter, relative abundances are undefined and
        ``l1 = 2``, ``f1 = 0`` is returned, matching the fallback used by
        ``process_dudes`` instead of raising ZeroDivisionError.
    """
    min_abundance = 0.001  # 0.1% minimum required relative abundance

    reference = groundtruth_reads if groundtruth_totals is None else groundtruth_totals
    metrics = {"l1": 0, "f1": 0, "unclassified": sum(reference.values())}

    predictions = {}  # number of reads assigned to every reported taxid
    total_read_abundance = 0
    path = f"estimations/sample_{sample_number}/kraken2-bracken/{method_threshold}.bracken"
    with open(path, "r") as f_in:
        next(f_in, None)  # skip header (tolerates a completely empty file)
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            cur_reads = int(fields[5])
            predictions[int(fields[1])] = cur_reads
            total_read_abundance += cur_reads
            metrics["unclassified"] -= cur_reads

    # All species with abundance in the predictions or in the groundtruth
    relevant_species = set(groundtruth_reads) | set(predictions)

    # Filter results based on the minimum required abundance; species that
    # are in the sample but were not estimated count as 0 reads.
    filtered_read_predictions = {}
    filtered_total_read_abundance = 0
    for s in relevant_species:
        cur_reads = predictions.get(s, 0)
        if total_read_abundance > 0 and cur_reads / total_read_abundance >= min_abundance:
            filtered_read_predictions[s] = cur_reads
            filtered_total_read_abundance += cur_reads
        else:
            filtered_read_predictions[s] = 0

    if filtered_total_read_abundance == 0:
        # Nothing was predicted: same worst-case convention as process_dudes.
        metrics["l1"] = 2
        metrics["f1"] = 0
        return metrics

    # Calculate errors and presence/absence counts
    tp = fp = fn = 0
    for s in relevant_species:
        estimated = filtered_read_predictions[s] / filtered_total_read_abundance
        detected = filtered_read_predictions[s] > 0
        if s in groundtruth_reads:
            metrics["l1"] += abs(estimated - groundtruth_reads[s])
            if detected:
                tp += 1
            else:
                fn += 1
        else:
            metrics["l1"] += abs(estimated)
            if detected:
                fp += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if precision + recall > 0:
        metrics["f1"] = 2 * ((precision * recall) / (precision + recall))
    else:
        metrics["f1"] = 0

    return metrics

## Centrifuge
Note that for the Centrifuge results, we used the number of uniquely mapped reads to estimate relative abundances since the other results strongly deviated from the groundtruth.

In [None]:
def process_centrifuge(method_threshold, sample_number, groundtruth_reads):
    """Score a Centrifuge estimate against the read-based groundtruth.

    Parses ``estimations/sample_<n>/centrifuge/<method_threshold>.report``
    (tab-delimited, one header line; the second column is read as the taxid,
    the third as the taxonomic level and the sixth as the number of reads
    uniquely assigned to the taxon). Only rows at level ``"species"`` are
    used. Taxa below 0.1% of the uniquely assigned species reads are dropped,
    the remainder is renormalised and compared with the groundtruth.

    Args:
        method_threshold: stem of the ``.report`` file to evaluate.
        sample_number: identifier substituted into the estimation path.
        groundtruth_reads: dict taxid -> read-based relative abundance.

    Returns:
        A dict with keys ``"l1"`` (sum of absolute abundance errors over all
        species in the groundtruth or the prediction) and ``"f1"``
        (presence/absence F1 score after filtering). Unclassified reads are
        not calculated here but are based on the .sam file.
        If no reads survive the filter, relative abundances are undefined and
        ``l1 = 2``, ``f1 = 0`` is returned, matching the fallback used by
        ``process_dudes`` instead of raising ZeroDivisionError.
    """
    min_abundance = 0.001  # 0.1% minimum required relative abundance

    metrics = {"l1": 0, "f1": 0}

    predictions = {}  # uniquely assigned reads for every reported species
    total_read_abundance = 0
    path = f"estimations/sample_{sample_number}/centrifuge/{method_threshold}.report"
    with open(path, "r") as f_in:
        next(f_in, None)  # skip header (tolerates a completely empty file)
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            if fields[2] != "species":
                # Rows at other levels never carry a prediction, so they
                # cannot contribute to the error or to tp/fp/fn.
                continue
            cur_reads = int(fields[5])  # number of reads UNIQUELY assigned
            predictions[int(fields[1])] = cur_reads
            total_read_abundance += cur_reads

    # All species with abundance in the predictions or in the groundtruth
    relevant_species = set(groundtruth_reads) | set(predictions)

    # Filter results based on the minimum required abundance; species that
    # are in the sample but were not estimated count as 0 reads.
    filtered_read_predictions = {}
    filtered_total_read_abundance = 0
    for s in relevant_species:
        cur_reads = predictions.get(s, 0)
        if total_read_abundance > 0 and cur_reads / total_read_abundance >= min_abundance:
            filtered_read_predictions[s] = cur_reads
            filtered_total_read_abundance += cur_reads
        else:
            filtered_read_predictions[s] = 0

    if filtered_total_read_abundance == 0:
        # Nothing was predicted: same worst-case convention as process_dudes.
        metrics["l1"] = 2
        metrics["f1"] = 0
        return metrics

    # Calculate errors and presence/absence counts
    tp = fp = fn = 0
    for s in relevant_species:
        estimated = filtered_read_predictions[s] / filtered_total_read_abundance
        detected = filtered_read_predictions[s] > 0
        if s in groundtruth_reads:
            metrics["l1"] += abs(estimated - groundtruth_reads[s])
            if detected:
                tp += 1
            else:
                fn += 1
        else:
            metrics["l1"] += abs(estimated)
            if detected:
                fp += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if precision + recall > 0:
        metrics["f1"] = 2 * ((precision * recall) / (precision + recall))
    else:
        metrics["f1"] = 0

    return metrics

## BWA + DUDes
Note that DUDes estimates FOLD-based abundance (aka taxonomic abundance) rather than read-based abundance

In [None]:
def process_dudes(method_threshold, sample_number, groundtruth_folds):
    """Score a BWA + DUDes estimate against the fold-based groundtruth.

    Parses ``estimations/sample_<n>/bwa-dudes/<method_threshold>_dudes.out``
    (tab-delimited, six preamble lines; the first column is read as the
    taxid, the second as the taxonomic level and the last as a percentage).
    Only rows at level ``"species"`` are used. Taxa below 0.1% of the total
    species abundance are dropped, the remainder is renormalised and compared
    with the groundtruth.

    Args:
        method_threshold: stem of the ``_dudes.out`` file to evaluate.
        sample_number: identifier substituted into the estimation path.
        groundtruth_folds: dict taxid -> fold-based relative abundance.

    Returns:
        A dict with keys ``"l1"`` (sum of absolute abundance errors over all
        species in the groundtruth or the prediction) and ``"f1"``
        (presence/absence F1 score after filtering). Unclassified reads are
        not calculated here but are based on the .sam file produced by BWA.
        If no species-level abundance survives the filter (e.g. the centroid
        reference sets, where no species-level estimates were generated),
        ``l1 = 2`` and ``f1 = 0`` are returned.
    """
    min_abundance = 0.001  # 0.1% minimum required relative abundance

    metrics = {"l1": 0, "f1": 0}

    predictions = {}  # predicted abundance (fraction) for every species
    total_abundance = 0
    path = f"estimations/sample_{sample_number}/bwa-dudes/{method_threshold}_dudes.out"
    with open(path, "r") as f_in:
        for _ in range(6):  # skip preamble (tolerates a truncated file)
            next(f_in, None)
        for line in f_in:
            line = line.strip()
            if not line:
                continue
            fields = line.split("\t")
            if fields[1] != "species":
                # Rows at other levels never carry a prediction, so they
                # cannot contribute to the error or to tp/fp/fn.
                continue
            cur_abundance = float(fields[-1]) / 100  # percentage -> fraction
            predictions[int(fields[0])] = cur_abundance
            total_abundance += cur_abundance

    # All species with abundance in the predictions or in the groundtruth.
    # The species IN the sample come from the groundtruth passed by the
    # caller, not from a module-level variable.
    relevant_species = set(groundtruth_folds) | set(predictions)

    # Filter results based on the minimum required abundance; species that
    # are in the sample but were not estimated count as 0.
    filtered_predictions = {}
    filtered_total_abundance = 0
    for s in relevant_species:
        cur_abundance = predictions.get(s, 0)
        if total_abundance > 0 and cur_abundance / total_abundance >= min_abundance:
            filtered_predictions[s] = cur_abundance
            filtered_total_abundance += cur_abundance
        else:
            filtered_predictions[s] = 0

    if filtered_total_abundance == 0:
        # No usable species-level estimate: report the worst-case scores.
        metrics["l1"] = 2
        metrics["f1"] = 0
        return metrics

    # Calculate errors and presence/absence counts
    tp = fp = fn = 0
    for s in relevant_species:
        estimated = filtered_predictions[s] / filtered_total_abundance
        detected = filtered_predictions[s] > 0
        if s in groundtruth_folds:
            metrics["l1"] += abs(estimated - groundtruth_folds[s])
            if detected:
                tp += 1
            else:
                fn += 1
        else:
            metrics["l1"] += abs(estimated)
            if detected:
                fp += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if precision + recall > 0:
        metrics["f1"] = 2 * ((precision * recall) / (precision + recall))
    else:
        metrics["f1"] = 0

    return metrics