# Getting started
**NOTE**: We assume that the groundtruths are known, since our results are based on simulations. Specifically, we assume that for every sample the user has access to a file called `sample_sampleid_groundtruth.tsv` (where `sampleid` is the sample number) located in the corresponding sample folder. For the experiments in our manuscript, these groundtruth files are provided on [Zenodo](https://doi.org/10.5281/zenodo.14727633) (folder structure: groundtruth_reads_SARS-CoV-2/Proximity-America_baseline/$abundance/$sample/groundtruth.tsv). Each line of a groundtruth file contains a lineage followed by its relative abundance, separated by a tab.

In [None]:
def read_groundtruth(sample_number):
    """Read the groundtruth abundances of one simulated sample.

    The file ``samples/sample_<n>/sample_<n>_groundtruth.tsv`` (relative to
    the current working directory) is parsed as tab-separated text in which
    the first column is a lineage name and the second column is its
    relative abundance.

    Args:
        sample_number: Identifier of the sample; it is interpolated into
            both the folder name and the file name.

    Returns:
        A dict mapping each lineage name (str) to its abundance (float).

    Raises:
        FileNotFoundError: If the groundtruth file does not exist.
        IndexError: If a non-blank line has fewer than two columns.
        ValueError: If the abundance column is not a valid number.
    """
    path = f"samples/sample_{sample_number}/sample_{sample_number}_groundtruth.tsv"
    abundances = {}
    with open(path, "r", encoding="utf-8") as f_in:
        for raw_line in f_in:
            stripped = raw_line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at the end
                # of the file) instead of failing on a missing column.
                continue
            fields = stripped.split("\t")
            abundances[fields[0]] = float(fields[1])
    return abundances

# Gather the groundtruth for the example sample (sample number 1). The file is
# looked up relative to the current working directory, so the notebook must be
# run from the folder that contains the `samples/` directory.
groundtruth = read_groundtruth(1)

## VLQ output

In [None]:
def process_bracken(method_threshold, sample_number, groundtruth):
    """Score one abundance estimate against the groundtruth of a sample.

    Despite its name, this function reads the file
    ``estimations/sample_<n>/kallisto/<method_threshold>/abundance.tsv``
    (relative to the current working directory). The first three lines are
    skipped as header lines; on every following line the first
    tab-separated column is taken as the lineage and the last column as
    its estimated abundance in percent.

    The estimates are rescaled so that they sum to 100%, and are then
    compared with ``groundtruth``:

    * ``l1`` is the sum over all lineages (groundtruth or predicted) of
      ``|predicted - true| / 100``, where a missing value counts as 0.
    * ``f1`` is the F1 score of lineage detection: a groundtruth lineage
      with a positive prediction is a true positive, a groundtruth lineage
      without one is a false negative, and a predicted lineage that is not
      in the groundtruth is a false positive.

    Args:
        method_threshold: Name of the sub-folder holding the estimate.
        sample_number: Identifier of the sample.
        groundtruth: Dict mapping lineage name to true abundance in
            percent, as returned by ``read_groundtruth``.

    Returns:
        A dict with the keys ``"l1"`` and ``"f1"``.

    Raises:
        FileNotFoundError: If the abundance file does not exist.
        ValueError: If the last column of a data line is not a number.
    """
    header_lines = 3  # number of leading lines that carry no estimates
    path = f"estimations/sample_{sample_number}/kallisto/{method_threshold}/abundance.tsv"

    # Predicted abundance (in percent) for every lineage listed in the file.
    predictions = {}
    with open(path, "r", encoding="utf-8") as f_in:
        for line_number, raw_line in enumerate(f_in):
            if line_number < header_lines:
                continue
            stripped = raw_line.strip()
            if not stripped:
                continue  # tolerate blank / trailing empty lines
            fields = stripped.split("\t")
            predictions[fields[0]] = float(fields[-1])

    # Rescale estimations to enforce 100% total abundance. When nothing was
    # predicted (total of 0) there is nothing to rescale, and dividing would
    # raise ZeroDivisionError.
    total_abundance = sum(predictions.values())
    if total_abundance > 0:
        scale = 100 / total_abundance
        predictions = {
            lineage: abundance * scale
            for lineage, abundance in predictions.items()
        }

    metrics = {"l1": 0, "f1": 0}
    tp = fp = fn = 0

    # Every lineage that is either truly present or was predicted. Sorting
    # makes the floating-point summation order, and hence the result,
    # reproducible between runs.
    relevant_lineages = sorted(set(groundtruth) | set(predictions))
    for lineage in relevant_lineages:
        # A groundtruth lineage missing from the estimate is predicted at 0.
        predicted = predictions.get(lineage, 0)
        if lineage in groundtruth:
            error = abs(predicted - groundtruth[lineage]) / 100
            if predicted > 0:
                tp += 1
            else:
                fn += 1
        else:
            error = abs(predicted) / 100
            if predicted > 0:
                fp += 1
        metrics["l1"] += error

    # Undefined ratios (empty denominators) are reported as 0.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if precision + recall > 0:
        metrics["f1"] = 2 * ((precision * recall) / (precision + recall))

    return metrics