# marti benchmarking

This notebook will use the Callao outputs generated in notebook 03 (monomer analysis) to compare and benchmark `marti` against a few other tools. We are using the monomer data because we will be running both `marti` and `restrander` twice on all the data, and 3 out of 4 runs will be on a single thread for benchmarking purposes. Running it on the smaller dataset will make this faster, but the same comparison could be performed on the MAS-seq data just the same.

`restrander` can be installed by following the instructions in their github: https://github.com/mritchielab/restrander?tab=readme-ov-file#installation

In [None]:
import os
import re
import warnings
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import yaml

from mdl.sc_isoform_paper import today
from mdl.sc_isoform_paper.constants import MONOMER_KEYS
from mdl.sc_isoform_paper.marti import CONFIG_DICT, SAMPLE_CONFIG
from mdl.sc_isoform_paper.marti_benchmark import summarize_input, summarize_many, collapse_marti_ct, plot_absolute, plot_heatmap, plot_upset_from_ct, plot_upset_from_per_input_df

In [None]:
root_dir = Path.home()

# directory receiving the generated shell scripts; it must exist before they are opened
sh_dir = root_dir / "sh_scripts"
sh_dir.mkdir(parents=True, exist_ok=True)

data_path = root_dir / "data" / "monomer"
figure_path = root_dir / "202501_figures"

# path to the marti binary
marti_bin = root_dir / "marti/build/bin/marti"

# path to restrander binary
restrander_dir = root_dir / "restrander"
restrander_bin = restrander_dir / "restrander"
restrander_config = restrander_dir / "config"

# path for restrander output; the generated commands redirect into it, so create it up front
restrander_path = data_path / f"{today}_restrander_benchmarks"
restrander_path.mkdir(parents=True, exist_ok=True)

# path for the FASTQs extracted from the marti BAMs (the restrander input).
# NOTE(review): the script-generation cell uses `fastq_path`, which was never defined;
# the location chosen here is an assumption -- adjust if the FASTQs should live elsewhere.
fastq_path = data_path / f"{today}_fastq"
fastq_path.mkdir(parents=True, exist_ok=True)

# path for marti output
marti_path = data_path / f"{today}_marti_benchmarks"
marti_path.mkdir(parents=True, exist_ok=True)

# path for GNU Time output
logs_dir = root_dir / "benchmark_logs"
logs_dir.mkdir(parents=True, exist_ok=True)

# paths for input files, the output from callao
callao_bams = sorted((data_path / "callao").glob("*bam"))
callao_bams


In [None]:
# marti settings merged on top of the shared config: the minimum polyA match length,
# and a single-thread override used for the single-core benchmark runs
default_polya_len = {"min_polyA_match": 20}
single_core = {"n_threads": 1}

# restrander configs with the default settings, keyed by library type.
# Each 10x key points at the config file of the same chemistry (3' -> 3prime, 5' -> 5prime).
RESTRANDER_CONFIG = {
    "PIPseq": restrander_config / "PIPseq.json",  # does not exist, we will create it
    "10x 3'": restrander_config / "10X-3prime.json",
    "10x 5'": restrander_config / "10X-5prime.json",
}

# Since restrander is initially intended for ONT data, the error rates in their configs are rather high,
# so we will create an alternate version of the provided 10X configs where we reduce the error rate to 0.05,
# raise the polyA length to 20, and set the search size to 150 to match settings with marti
RESTRANDER_PB_CONFIG = {
    "PIPseq": restrander_config / "PIPseq_PB.json",
    "10x 3'": restrander_config / "10X-3prime_PB.json",
    "10x 5'": restrander_config / "10X-5prime_PB.json",
}

In [None]:
# restrander-default settings with the PIPseq primers
# (the PIPseq config file is not expected to exist yet, so it is created here)
pipseq_default_json = """{
    "name": "PIPseq",
    "description": "Uses PIPseq primers.",
    "pipeline": [
        {
            "type": "poly",
            "tail-length": 12,
            "search-size": 200
        },
        {
            "type": "primer",
            "tso": "AAGCAGTGGTATCAACGCAGAG",
            "rtp": "CTACACGACGCTCTTCCGATCT",
            "report-artefacts": true
        }
    ],
    "silent": false,
    "exclude-unknowns": true,
    "error-rate": 0.25
}
"""

# "PB" variants: tail length 20, search size 150 and error rate 0.05
pipseq_pb_json = """{
    "name": "PIPseq PB",
    "description": "Uses PIPseq primers.",
    "pipeline": [
        {
            "type": "poly",
            "tail-length": 20,
            "search-size": 150
        },
        {
            "type": "primer",
            "tso": "AAGCAGTGGTATCAACGCAGAG",
            "rtp": "CTACACGACGCTCTTCCGATCT",
            "report-artefacts": true
        }
    ],
    "silent": false,
    "exclude-unknowns": true,
    "error-rate": 0.05
}
"""

tenx_3p_pb_json = """{
    "name": "10X 3' PB",
    "description": "Uses 10X 3' primers.",
    "pipeline": [
        {
            "type": "poly",
            "tail-length": 20,
            "search-size": 150
        },
        {
            "type": "primer",
            "tso": "AAGCAGTGGTATCAACGCAGAGTAC",
            "rtp": "CTACACGACGCTCTTCCGATCT",
            "report-artefacts": true
        }
    ],
    "silent": false,
    "exclude-unknowns": true,
    "error-rate": 0.05
}
"""

# the 5' variant has the tso/rtp sequences swapped relative to the 3' variant
tenx_5p_pb_json = """{
    "name": "10X 5' PB",
    "description": "Uses 10X 5' primers.",
    "pipeline": [
        {
            "type": "poly",
            "tail-length": 20,
            "search-size": 150
        },
        {
            "type": "primer",
            "tso": "CTACACGACGCTCTTCCGATCT",
            "rtp": "AAGCAGTGGTATCAACGCAGAGTAC",
            "report-artefacts": true
        }
    ],
    "silent": false,
    "exclude-unknowns": true,
    "error-rate": 0.05
}
"""

# write each config to the location registered for it in the config maps
for config_file, json_text in (
    (RESTRANDER_CONFIG["PIPseq"], pipseq_default_json),
    (RESTRANDER_PB_CONFIG["PIPseq"], pipseq_pb_json),
    (RESTRANDER_PB_CONFIG["10x 3'"], tenx_3p_pb_json),
    (RESTRANDER_PB_CONFIG["10x 5'"], tenx_5p_pb_json),
):
    with open(config_file, "w") as config_out:
        config_out.write(json_text)

Prepare the Marti run configs and commands. These are basically the same runs as initially done in notebook 03, but here we measure resources and time with `GNU Time`.
In addition, `restrander` needs FASTQ input, so we will extract FASTQs from the BAMs using samtools.

In [None]:
# Five shell scripts are written side by side: single-thread marti, "8t" marti,
# FASTQ extraction, restrander with the default configs, and restrander with the PB configs.
with open(sh_dir / f"{today}_monomer_marti_benchmark.sh", "w") as marti_sh, \
     open(sh_dir / f"{today}_monomer_marti_8t_benchmark.sh", "w") as marti_8t_sh, \
     open(sh_dir / f"{today}_monomer_fastq_restrander.sh", "w") as fastq_sh, \
     open(sh_dir / f"{today}_monomer_restrander_benchmark.sh", "w") as restrander_sh, \
     open(sh_dir / f"{today}_monomer_restrander_PB_benchmark.sh", "w") as restrander_pb_sh:

    for cb in callao_bams:
        stem = cb.stem
        # the third dot-separated field of the file name is the index into MONOMER_KEYS
        sample_type = MONOMER_KEYS[int(cb.name.split(".")[2])][0]

        # settings shared by both marti runs; the single-thread run adds `single_core` on top,
        # while the "_8t" run leaves the thread count to CONFIG_DICT (presumably 8 -- confirm)
        shared_settings = (
            {"input_bam": str(cb)}
            | SAMPLE_CONFIG[sample_type]
            | CONFIG_DICT
            | default_polya_len
        )

        # one run directory and config file per (input file, thread setting)
        run_dir_1t = marti_path / stem
        run_dir_8t = marti_path / (stem + "_8t")
        config_1t = run_dir_1t / "config.yaml"
        config_8t = run_dir_8t / "config.yaml"

        for run_dir, config_yaml, settings in (
            (run_dir_1t, config_1t, shared_settings | single_core),
            (run_dir_8t, config_8t, shared_settings),
        ):
            run_dir.mkdir(exist_ok=True, parents=True)
            with open(config_yaml, "w") as out_yaml:
                # the extra newline keeps the file identical to print()-ing the dump
                out_yaml.write(yaml.dump(settings, sort_keys=False) + "\n")

        # marti commands; GNU Time's report goes to stderr, which is redirected to the log
        marti_sh.write(
            f"/usr/bin/time -v {marti_bin} {config_1t} 2> {logs_dir}/time_marti_{stem}.log\n"
        )
        marti_8t_sh.write(
            f"/usr/bin/time -v {marti_bin} {config_8t} 2> {logs_dir}/time_marti_8t_{stem}.log\n"
        )

        # FASTQ extraction reads the classified BAM from the "_8t" run directory
        fastq_file = f"{fastq_path}/{stem}.marti_tags.fastq"
        fastq_sh.write(
            f"samtools fastq -@ 2 {run_dir_8t}/{stem}.classified.bam -0 {fastq_file}\n"
        )

        # restrander commands: default configs, then the "_PB" configs
        for tag, configs, script in (
            ("", RESTRANDER_CONFIG, restrander_sh),
            ("_PB", RESTRANDER_PB_CONFIG, restrander_pb_sh),
        ):
            script.write(
                f"/usr/bin/time -v {restrander_bin} {fastq_file}"
                f" {restrander_path}/{stem}.restrander{tag}.fastq.gz"
                f" {configs[sample_type]}"
                f" > {restrander_path}/{stem}.restrander{tag}-stats.json"
                f" 2> {logs_dir}/time_restrander{tag}_{stem}.log\n"
            )


We now need to run the generated scripts outside of this notebook.

After that, we can parse the timing logs.

In [None]:
# A trailing "[h:]m:ss[.fraction]" token at the end of a line (optionally followed by whitespace).
TIME_AT_END = re.compile(r'(\d+(?::\d{2}){1,2}(?:\.\d+)?)\s*$')
# Log names look like "time_<tool>_<input>.log", where <tool> is "marti", "marti_<N>t",
# "restrander" or "restrander_PB".
FNAME_RE = re.compile(r'^time_(?P<tool>marti(?:_\d+t)?|restrander(?:_PB)?)_(?P<input>.+)\.log$')


def parse_fname(p: Path):
    """Split a timing-log file name into its tool and input-name parts.

    Args:
        p: Path to a log file named ``time_<tool>_<input>.log``.

    Returns:
        A ``(tool, input)`` tuple of strings.

    Raises:
        ValueError: If the file name does not follow the expected pattern.
    """
    m = FNAME_RE.match(p.name)
    if not m:
        raise ValueError(f"Unexpected filename: {p.name}")
    return m.group('tool'), m.group('input')


def parse_elapsed_line(line: str) -> float:
    """Convert the time token at the end of ``line`` into seconds.

    Args:
        line: A line ending in ``m:ss``, ``m:ss.ff`` or ``h:mm:ss`` (trailing
            whitespace allowed).

    Returns:
        The elapsed time in seconds.

    Raises:
        ValueError: If the line does not end with a recognizable time token.
    """
    m = TIME_AT_END.search(line)
    if m is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError(f"No elapsed time found in line: {line!r}")
    parts = m.group(1).split(":")
    # the regex guarantees 2 or 3 colon-separated fields: [hours:]minutes:seconds
    seconds = float(parts[-1])
    minutes = int(parts[-2])
    hours = int(parts[-3]) if len(parts) == 3 else 0
    return hours*3600 + minutes*60 + seconds

def parse_time_log(path: Path) -> dict:
    """Pull the user, system and wall-clock times out of one timing log.

    Lines are matched on their prefix after stripping surrounding whitespace.
    Fields whose line is absent are simply left out of the result.

    Args:
        path: Location of the log file.

    Returns:
        A dict with any of the keys ``user_time``, ``sys_time`` and ``real_time``.
    """
    def _value_after_colon(text: str) -> float:
        # the number sits after the last colon on the line
        return float(text.rsplit(":", 1)[1].strip())

    timings = {}
    with open(path) as handle:
        for entry in map(str.strip, handle):
            if entry.startswith("User time"):
                timings["user_time"] = _value_after_colon(entry)
            elif entry.startswith("System time"):
                timings["sys_time"] = _value_after_colon(entry)
            elif entry.startswith("Elapsed (wall clock) time"):
                timings["real_time"] = parse_elapsed_line(entry)
    return timings


# Gather one record per timing log that yielded at least one value.
rows = []
for log_file in logs_dir.glob("time_*.log"):
    tool_name, sample_name = parse_fname(log_file)
    timings = parse_time_log(log_file)
    if not timings:
        continue
    rows.append({"tool": tool_name, "input": sample_name} | timings)

df = pd.DataFrame(rows)
# CPU time is user time plus system time
df["cpu_time"] = df["user_time"].add(df["sys_time"])

# wide table: one row per input, one "<metric>_<tool>" column per metric/tool pair
pivot = df.pivot(index="input", columns="tool", values=["cpu_time", "real_time"])
pivot.columns = ["_".join(pair) for pair in pivot.columns]
pivot = pivot.reset_index()

print(pivot)

In [None]:
# Absolute times
pivot.plot.bar(x="input", y=["cpu_time_marti", "cpu_time_marti_8t", "cpu_time_restrander", "cpu_time_restrander_PB"])
plt.ylabel("CPU time (s)")
plt.title("CPU time comparison")
plt.show()

pivot.plot.bar(x="input", y=["real_time_marti", "real_time_marti_8t", "real_time_restrander", "real_time_restrander_PB"])
plt.ylabel("Real time (s)")
plt.title("Wall clock comparison")
plt.show()


In [None]:
# Long format: one row per (input, tool, metric), with the value in "seconds"
tidy = pd.melt(
    df.loc[:, ["input", "tool", "cpu_time", "real_time"]],
    id_vars=["input", "tool"],
    var_name="metric",
    value_name="seconds",
)

In [None]:
# Plot the CPU-time and wall-clock metrics from the long-format `tidy` table
# (the trailing comments keep the optional `savepath` arguments at hand).
plot_absolute(tidy, metric="cpu_time",  title="CPU time comparison") #,  savepath="abs_cpu.svg")
plot_absolute(tidy, metric="real_time", title="Wall clock comparison") # savepath="abs_real.svg")

We will now compare the artefact classifications of both tools. For `marti` we read the "lb" tag. Because `marti` reports more categories than restrander, we will collapse those extra categories into "Other" to simplify the comparison.

For `restrander`, there are 2 outputs. One with all the "proper" reads restranded, and one with artefacts and unknowns. We will use the lower error rate version because the default error rate leads to incorrect TSO/RT matches.

We can first look at the comparison for one sample.

In [None]:
# Compare both tools on the first input file only
first_sample = callao_bams[0].stem
df_reads, ct = summarize_input(
    first_sample,
    marti_root=marti_path,
    restrander_root=restrander_path,
)
print(ct)  # counts per category pair

We can now visualize the category matches as a heatmap or an upset plot.

In [None]:
# Heatmap of the single-sample category-pair counts, without normalization
plot_heatmap(ct, normalize=None,  title="Counts") #,            savepath="heat_counts.svg")

In [None]:
# pandas emits FutureWarnings about upcoming compatibility changes; silence only that
# category so that unrelated warnings in the rest of the notebook remain visible
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# UpSet plot built from the single-sample category-pair counts
plot_upset_from_ct(ct, title="UpSet: one sample")

This is however a bit messy to look at because `marti` has a number of categories that `restrander` does not, so for simplicity we can also collapse all the `marti` extra categories into an "Other" group.

In [None]:
# Collapsed version of the single-sample counts table (see `collapse_marti_ct`)
ct_collapsed = collapse_marti_ct(ct)

In [None]:
# Four views of the collapsed single-sample table: raw counts, then the
# 'row', 'col' and 'all' normalization modes of `plot_heatmap`
plot_heatmap(ct_collapsed, normalize=None,  title="Counts") #,            savepath="heat_counts.svg")
plot_heatmap(ct_collapsed, normalize='row', title="Row-normalized %") #,  savepath="heat_row.svg")
plot_heatmap(ct_collapsed, normalize='col', title="Col-normalized %") #,  savepath="heat_col.svg")
plot_heatmap(ct_collapsed, normalize='all', title="Global %") #,          savepath="heat_all.svg")

In [None]:
# UpSet plot built from the collapsed single-sample table
plot_upset_from_ct(ct_collapsed, title="UpSet: one sample")

We can now get overall stats across all libraries and flowcells.

In [None]:
# Run the same comparison over every callao BAM
all_sample_names = [bam.stem for bam in callao_bams]
per_input_df, agg_ct = summarize_many(
    base_names=all_sample_names,
    marti_root=marti_path,
    restrander_root=restrander_path,
)

Again we will collapse the extra categories here to simplify the visualization, but you can run the plotting methods with just `agg_ct` instead of `agg_ct_collapsed` just fine.

In [None]:
# Collapsed version of the all-samples counts table (see `collapse_marti_ct`)
agg_ct_collapsed = collapse_marti_ct(agg_ct)

In [None]:
# Four views of the collapsed all-samples table: raw counts, then the
# 'row', 'col' and 'all' normalization modes of `plot_heatmap`
plot_heatmap(agg_ct_collapsed, normalize=None, title="All samples combined")
plot_heatmap(agg_ct_collapsed, normalize='row', title="Row-normalized %")#,  savepath="heat_row.svg")
plot_heatmap(agg_ct_collapsed, normalize='col', title="Col-normalized %")#,  savepath="heat_col.svg")
plot_heatmap(agg_ct_collapsed, normalize='all', title="Global %")#,          savepath="heat_all.svg")


In [None]:
# UpSet plot built from the collapsed all-samples table
plot_upset_from_ct(agg_ct_collapsed, title="UpSet: Marti vs Restrander all samples")

Because most of the matches are Proper-Proper, they overshadow the other pairs, so we will also generate a version of the upset plot where we exclude that pair.

In [None]:
# Same plot, passing the ("Proper", "Proper") pair as `exclude_pair`
plot_upset_from_ct(agg_ct_collapsed, title="All samples (excl. Proper-Proper)",
                   exclude_pair=("Proper","Proper"))

We can also just view the data as a table, either before or after collapsing categories.

In [None]:
# display the all-samples table before collapsing
agg_ct

In [None]:
# display the all-samples table after collapsing
agg_ct_collapsed

For a more detailed look at the categories that `marti` can identify but `restrander` does not, we can also color the count bars of the upset plot by category.

In [None]:
# UpSet plot built from the per-input table returned by `summarize_many`
plot_upset_from_per_input_df(per_input_df)

# Marti unknowns investigation

While developing `marti`, besides the known and hypothesized artifact categories, we defined new artifact categories as we uncovered new recurrent structures that were previously "unknown" to it. Any read not matching a defined artifact category gets assigned to the `Unk`(nown) category, but all constituent elements identified are still reported for each read. This allows us to check that there are no recurring artifacts left. To do so, we will take a look at the frequency of the different structures that are found in the `Unk` reads.

In [None]:
from mdl.sc_isoform_paper.constants import MASSEQ_KEYS

In [None]:
# From here on the paths point at the MAS-seq data instead of the monomer data
root_dir = Path.home()

data_path = root_dir.joinpath("data", "masseq")

# existing marti output directory for the MAS-seq runs
marti_path = data_path.joinpath(f"{today}_marti")

This time to have more data, and because we do not need to rerun Marti as we already have the outputs, we can take a look at the larger MAS-seq runs.

In [None]:
# Pattern: "m8", any run of non-"/" characters, ".skera.", then a captured run of
# digits at the very end of the string.
MASSEQ_REGEX = re.compile(r"m8[^/]*\.skera\.(\d+)$")

In [None]:
def scan_runs_masseq(base_path):
    """Collect class and structure counts for every run found below ``base_path``.

    A run is any directory holding ``reports/class_counts.tsv``. The run folder name
    must match ``MASSEQ_REGEX``, and the captured index must be a key of
    ``MASSEQ_KEYS``; other runs are reported on stdout and skipped. Runs without a
    ``reports/structure_counts.tsv`` are also reported and skipped, although their
    dataset entry (with its label) has already been created by then.

    NOTE(review): ``parse_class_counts`` and ``parse_structure_counts`` are not
    defined or imported in the visible part of the notebook -- confirm where they
    come from.

    Args:
        base_path: Root directory that is searched recursively.

    Returns:
        A defaultdict keyed by dataset index. Each value is a dict with ``label``
        (the ``MASSEQ_KEYS`` entry), ``files`` (one dict per run with ``run_id``,
        ``class_counts`` and ``structure_counts``) and ``class_totals`` (class
        counts summed over the runs of that dataset).
    """
    def _new_entry():
        return {"label": None, "files": [], "class_totals": defaultdict(int)}

    results = defaultdict(_new_entry)

    for class_tsv in Path(base_path).rglob("reports/class_counts.tsv"):
        # <run folder>/reports/class_counts.tsv
        run_folder = class_tsv.parent.parent
        run_name = run_folder.name

        hit = MASSEQ_REGEX.match(run_name)
        if hit is None:
            print(f"Skipping unrecognized MASSEQ folder: {run_name}")
            continue

        key = int(hit.group(1))
        if key not in MASSEQ_KEYS:
            print(f"Unmapped MASSEQ index: {run_name}")
            continue

        entry = results[key]
        entry["label"] = MASSEQ_KEYS[key]

        structure_tsv = run_folder / "reports" / "structure_counts.tsv"
        if not structure_tsv.exists():
            print(f"Missing structure_counts.tsv in {run_name}")
            continue

        per_class = parse_class_counts(class_tsv)
        per_structure = parse_structure_counts(structure_tsv)

        entry["files"].append({
            "run_id": run_name,
            "class_counts": per_class,
            "structure_counts": per_structure,
        })

        # accumulate the per-class totals across all runs of this dataset
        totals = entry["class_totals"]
        for class_name, n_reads in per_class.items():
            totals[class_name] += n_reads

    return results


In [None]:
def plot_unk_subclasses(unk_structs, total_reads, title=None, save_path=None):
    """Draw a bar chart of structure counts as a proportion of all reads.

    Bars are ordered from the most to the least frequent structure.

    Args:
        unk_structs: Mapping from structure name to read count. When empty, a
            message is printed and nothing is drawn.
        total_reads: Denominator used to turn counts into proportions.
        title: Plot title; a generic title is used when omitted.
        save_path: When given, the figure is saved there at 300 dpi; otherwise
            it is shown.
    """
    if not unk_structs:
        print("No 'Unk' subclass entries found.")
        return

    # most frequent structures first
    ranked = sorted(unk_structs.items(), key=lambda item: -item[1])
    subclasses = tuple(name for name, _ in ranked)
    proportions = [n_reads / total_reads for _, n_reads in ranked]
    positions = range(len(subclasses))

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.bar(positions, proportions, color="#a05d56")

    ax.set_xticks(positions)
    ax.set_xticklabels(subclasses, rotation=90, fontsize=8)
    ax.set_ylabel("Proportion of total reads")
    ax.set_title(title or "Unknown Structure Subclass Proportions")

    fig.tight_layout()

    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, dpi=300)

In [None]:
def plot_all_masseq_unk(results, output_dir, file_format="png"):
    """Save one 'Unk' structure plot per dataset entry in ``results``.

    Args:
        results: Mapping whose values hold ``label``, ``files`` and
            ``class_totals``, as produced by ``scan_runs_masseq``.
        output_dir: Directory for the figures; created when missing.
        file_format: File extension used for the saved figures.
    """
    os.makedirs(output_dir, exist_ok=True)

    # spaces and apostrophes are dropped from the label parts used in file names
    strip_chars = {ord(" "): None, ord("'"): None}

    for entry in results.values():
        label_str = "_".join(part.translate(strip_chars) for part in entry["label"])
        save_path = os.path.join(output_dir, f"{label_str}_UnkSubclasses.{file_format}")

        # denominator: every read of the dataset, across all classes
        total = sum(entry["class_totals"].values())

        # sum the "Unk" structure counts over all runs of the dataset
        unk_subclass_counts = defaultdict(int)
        for run in entry["files"]:
            unk_structures = run["structure_counts"].get("Unk", {})
            for subclass, count in unk_structures.items():
                unk_subclass_counts[subclass] += count

        print(f"Saving {file_format.upper()}: {save_path}")
        plot_unk_subclasses(
            unk_structs=unk_subclass_counts,
            total_reads=total,
            title=f"MASSEQ {label_str} – Unk Subclasses",
            save_path=save_path,
        )


We first parse the `marti` outputs.

In [None]:
# scan the marti output directory for per-run count reports
masseq_results = scan_runs_masseq(marti_path)

And then plot the distribution of the different structures found.

In [None]:
# output directory for the per-dataset "Unk" structure plots
masseq_unk_figures = figure_path / "masseq_unk_subclass_plots"
# `figure_path` itself is never created in this notebook, so create missing parents too
masseq_unk_figures.mkdir(parents=True, exist_ok=True)

In [None]:
# one SVG per dataset, written into `masseq_unk_figures`
plot_all_masseq_unk(masseq_results, masseq_unk_figures, file_format="svg")

None of the structures reaches even 0.001 of the data, or 0.002 when considering both orientations, so there is no good reason to define more artifact categories from this data. This could however change if newer library construction methods produce new types of artifacts in the future.