# Summarising compute speed

`benchmark.py` was run against the REFSOIL collection -- 960 whole bacterial genomes. We separately recorded times to:
- load the sequences from compressed files and convert them into `SeqRecord` objects
- identify the divergent set

Each condition was run 5 times, with the sequences randomly drawn without replacement.

# Synopsis

Performance is approximately linear with the number of sequences.

**TODO** actually plot the best fit line, not my little hack.

In [None]:
from collections import defaultdict

import numpy as np
import plotly.graph_objects as go
from cogent3 import load_table, make_table

In [None]:
def summarise(table):
    """Collapse replicate timings into one row per (k, num_seqs) condition.

    Parameters
    ----------
    table
        table with the columns "k", "num_seqs", "time(s2r)" and
        "time(maxd)"

    Returns
    -------
    A table, sorted on "k" and "num_seqs", holding the mean and the
    sample standard deviation (ddof=1) of each timing column for every
    distinct condition.
    """
    group_cols = "k", "num_seqs"
    collated = defaultdict(list)
    for k, num_seqs in table.distinct_values(group_cols):
        condition = (k, num_seqs)
        # keep only the replicate rows measured under this condition
        replicates = table.filtered(
            lambda row: tuple(row) == condition, columns=group_cols
        )
        collated["k"].append(k)
        collated["num_seqs"].append(num_seqs)
        # same summary statistics for both timed stages
        for stage in ("s2r", "maxd"):
            times = replicates.columns[f"time({stage})"]
            collated[f"mean({stage})"].append(times.mean())
            collated[f"stdev({stage})"].append(times.std(ddof=1))
    summary = make_table(data=collated)
    return summary.sorted(columns=group_cols)


# Load the raw timings (presumably the output written by benchmark.py --
# TODO confirm) and reduce the replicates to one row per (k, num_seqs).
table = load_table("benchmark.tsv")
summary = summarise(table)
# bare expression so the notebook renders the summary table as cell output
summary

In [None]:
def _fit_line(x, y):
    """Return the end points of the least-squares line through (x, y).

    Parameters
    ----------
    x, y
        equal length series of numbers

    Returns
    -------
    (xs, ys), each a length-2 array giving the fitted line at min(x) and
    max(x), or None when there are fewer than two distinct x values (a
    straight line is then not determined).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if np.unique(x).size < 2:
        return None
    slope, intercept = np.polyfit(x, y, deg=1)
    ends = np.array([x.min(), x.max()])
    return ends, slope * ends + intercept


def get_fig(table, k, stat, title):
    """Plot mean run time against number of sequences for one value of k.

    Parameters
    ----------
    table
        summary table with the columns "k", "num_seqs", "mean(<stat>)"
        and "stdev(<stat>)"
    k
        only rows whose "k" column equals this value are plotted
    stat
        name of the timed stage, used to select the "mean(<stat>)" and
        "stdev(<stat>)" columns
    title
        figure title

    Returns
    -------
    A plotly Figure with the observed means (error bars show the standard
    deviation) and, when it can be estimated, the least-squares best fit
    line through those means.
    """
    mers = table.filtered(f"k == {k}")
    nseqs = mers.columns["num_seqs"]
    stats = mers.columns[f"mean({stat})"]
    traces = [
        go.Scatter(
            name="observed",
            x=nseqs,
            y=stats,
            error_y=dict(
                type="data",  # value of error bar given in data coordinates
                array=mers.columns[f"stdev({stat})"],
                visible=True,
            ),
        )
    ]
    # A real least-squares fit: joining (min x, min y) to (max x, max y) is
    # only a line through two extreme values and misleads whenever the
    # smallest time does not occur at the smallest number of sequences.
    fitted = _fit_line(nseqs, stats)
    if fitted is not None:
        fit_x, fit_y = fitted
        traces.append(go.Scatter(name="linear", x=fit_x, y=fit_y, mode="lines"))
    return go.Figure(
        data=traces,
        layout=dict(
            title=title,
            xaxis=dict(title="number of sequences"),
            yaxis=dict(title="mean time (seconds)"),
            width=500,
            height=500,
        ),
    )

In [None]:
# Mean time of the "s2r" stage against number of sequences, restricted to
# the summary rows where k == 3.
fig = get_fig(summary, 3, "s2r", "Converting sequences to records")
fig.show()

# Identifying the maximally divergent set

In [None]:
# Mean time of the "maxd" stage against number of sequences, restricted to
# the summary rows where k == 7.
fig = get_fig(summary, 7, "maxd", "Identifying the most divergent sequences")
fig.show()