### Comparing results from IsoQuant run on MASseq data

We first filter the output from IsoQuant to transcripts with at least 5 counts--this removes very rare transcripts which can be less reliable.

After creating filtered GTF files, we compare them with the `gffcompare` tool (v0.12.6).

```
gffcompare -r [gencode_basic_gtf] [filtered_gtfs ...] --strict-match -o [gffcompare_path]/gffcmp
```

In [None]:
import csv
import itertools
import pickle
import warnings
from concurrent.futures import ProcessPoolExecutor
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pysam

from mdl.sc_isoform_paper import today
from mdl.sc_isoform_paper.constants import MASSEQ_KEYS, MASSEQ_FILENAMES, SAMPLE_COLORS
from mdl.sc_isoform_paper.plots import plot_dists
from mdl.sc_isoform_paper.priming import PrimingClassifier
from mdl.sc_isoform_paper.util import filter_gtf

import upsetplot

In [None]:
# Silence all warnings for this session: the upsetplot package triggers some
# pandas deprecation warnings, which can be ignored for now.
warnings.simplefilter("ignore")

# set the htslib verbosity used by pysam to 0
pysam.set_verbosity(0)

# every input and output path below hangs off the user's home directory
root_dir = Path.home()
sh_dir = root_dir.joinpath("sh_scripts")
data_path = root_dir.joinpath("data", "masseq")
figure_path = root_dir.joinpath("202501_figures")

# IsoQuant runs (one sub-directory per sample)
isoquant_path = data_path.joinpath("20240722_isoquant")
# annotated BAMs
annotated_path = data_path.joinpath("20250124_annotated")

In [None]:
# dated output directory for the count-filtered GTFs
filtered_gtf_path = data_path / f"{today}_filtered_gtfs"
filtered_gtf_path.mkdir(exist_ok=True)

# filter the IsoQuant transcript models of each of the three samples, using
# the per-transcript counts table written next to them
for i in (1, 3, 4):
    fn = MASSEQ_FILENAMES[i]
    isoquant_out = isoquant_path / fn / "OUT"
    filter_gtf(
        isoquant_out / "OUT.transcript_models.gtf",
        filtered_gtf_path / f"{fn}.gtf",
        isoquant_out / "OUT.transcript_model_counts.tsv",
    )

# dated directory where the gffcompare output is expected (the tool itself is
# run outside of this notebook)
gffcompare_path = data_path / f"{today}_gffcompare"
gffcompare_path.mkdir(exist_ok=True)

### GFFCompare

After filtering the IsoQuant outputs to a minimum of 5 counts, we run `GffCompare` to match the results to each other and to the reference:

```
gffcompare -r GRCh38.gencode.v39.annotation.basic.gtf [filtered_gtf_path]/{pipseq_8x,10x_3p,10x_5p}.gtf --strict-match -o [gffcompare_path]/gffcmp
```

### UpSet plots

From here we can make the plots in Figure 2: UpSet plots showing the overlaps between the different samples, and a violin plot showing the UMI distribution for unique and shared transcripts.

In [None]:
# names for the four leading columns of the gffcompare tracking file
cols = ("gff_id", "xloc", "gene_tx", "class")

# keys of the three samples that were compared, in a fixed order
samples = (MASSEQ_KEYS[1], MASSEQ_KEYS[3], MASSEQ_KEYS[4])
samples

In [None]:
def read_gffcompare(gffcompare_tracking, cols):
    """Load a tab-separated gffcompare tracking file as a list of dicts.

    The file has no header line, so ``cols`` supplies the field names, in
    column order. Each returned dict maps those names to the string values
    of one line.
    """
    with open(gffcompare_tracking) as tracking:
        reader = csv.DictReader(tracking, delimiter="\t", fieldnames=cols)
        return [record for record in reader]


def row_filter(rows, classes=None):
    """Lazily yield gffcompare rows, optionally restricted by class code.

    ``classes`` is any iterable of class codes and is converted to a set, so
    a string such as ``"=c"`` selects the codes ``=`` and ``c``. With
    ``classes=None`` every row is yielded unchanged.
    """
    wanted = None if classes is None else set(classes)
    for row in rows:
        if wanted is None or row["class"] in wanted:
            yield row


def make_overlap_df(rows, samples, classes="="):
    """Build the upsetplot input describing which samples contain each row.

    Rows are first restricted to ``classes`` with ``row_filter``. Each
    remaining row is identified by its position in that filtered sequence
    and is added to the membership list of every sample whose column is not
    ``-``. Membership lists are keyed by the space-joined sample key.
    """
    membership = defaultdict(list)
    for idx, row in enumerate(row_filter(rows, classes)):
        for sample in samples:
            if row[sample] == "-":
                continue
            membership[" ".join(sample)].append(idx)

    return upsetplot.from_contents(membership)

In [None]:
# the four fixed columns are followed by one column per sample
tracking_columns = cols + samples
rows = read_gffcompare(gffcompare_path / "gffcmp.tracking", tracking_columns)
len(rows)

In [None]:
# UpSet plot restricted to rows with class code "=" or "c"
fig = plt.figure(figsize=(12, 8))
match_overlap_df = make_overlap_df(rows, samples, "=c")
axs = upsetplot.plot(
    match_overlap_df,
    fig=fig,
    sort_categories_by="input",
    totals_plot_elements=0,
)
fig.suptitle("GFFCompare Overlap, = and c")
plt.savefig(figure_path / "fig2e_gffcompare_match.svg")
plt.show()

In [None]:
# UpSet plot over every row of the tracking file (no class filter)
fig = plt.figure(figsize=(12, 8))
all_overlap_df = make_overlap_df(rows, samples, None)
axs = upsetplot.plot(
    all_overlap_df,
    fig=fig,
    sort_categories_by="input",
    totals_plot_elements=0,
)
fig.suptitle("GFFCompare Overlap, all tx")
plt.savefig(figure_path / "fig2f_gffcompare_all.svg")
plt.show()

## Counting reads and UMIs for shared and unique transcripts

Here we collect two sets of counts: first, we break down the reads for each set of transcripts by their priming status, and compute the overall internal priming rate for unique vs shared transcripts.

Next, we count the UMIs (as determined by the combination of transcript, sequence, and UMI) for each transcript in the two sets, and plot the distributions (Fig 2g).

This takes a little while and requires the annotated BAMs to count the UMIs. Alternatively, if the results have already been saved to a pickle, we can load them in the next cell and skip to **Results**.

In [None]:
# previously computed statistics, if any, are cached here
stats_file = data_path / "gffcompare_stats.pickle"
if stats_file.exists():
    # NOTE: unpickling can execute arbitrary code -- only load a stats file
    # that was written by this notebook
    cached_stats = pickle.loads(stats_file.read_bytes())
    shared_read_priming, unique_read_priming, shared_counts, unique_counts = cached_stats

In [None]:
# sample -> indices of the tracking rows in which the sample has an entry
sample_sets = defaultdict(set)
# sample -> {row index: second "|"-separated field of the sample's entry}
# (presumably the sample's transcript id -- it is matched against the YT tag later)
sample_tx_map = defaultdict(dict)

for row_idx, row in enumerate(row_filter(rows, None)):
    for sample in samples:
        entry = row[sample]
        if entry == "-":
            # "-" marks a row with no entry for this sample
            continue
        sample_sets[sample].add(row_idx)
        sample_tx_map[sample][row_idx] = entry.split("|")[1]

In [None]:
# Row indices present in every sample. This does not depend on the sample, so
# it is computed once instead of once per loop iteration. The guard keeps the
# empty case from raising (set.intersection needs at least one argument).
if sample_sets:
    shared_rows = set.intersection(*sample_sets.values())
else:
    shared_rows = set()

unique_i = dict()
shared_i = dict()

for s, own_rows in sample_sets.items():
    # rows seen in this sample and in none of the others
    unique_i[s] = own_rows.difference(
        *(other for s2, other in sample_sets.items() if s2 != s)
    )
    # every sample refers to the same (read-only) set of shared rows
    shared_i[s] = shared_rows

# translate row indices into each sample's own transcript ids
u_sets = {s: {sample_tx_map[s][i] for i in unique_i[s]} for s in sample_sets}
s_sets = {s: {sample_tx_map[s][i] for i in shared_i[s]} for s in sample_sets}


In [None]:
# names of the priming tags that PrimingClassifier lists as good
good_tags = frozenset(tag.name for tag in PrimingClassifier.GOOD_PRIMING_TAGS)


def count_tx_priming(annotated_bam, tx_set, good_tags=good_tags):
    """Count reads per transcript, split by whether their priming tag is good.

    Reads every record of ``annotated_bam`` and keeps those whose ``YT`` tag
    is in ``tx_set``. Returns a mapping ``YT value -> Counter`` where the
    Counter key is ``True`` when the record's ``XC`` tag is in ``good_tags``
    and ``False`` otherwise. Every record is expected to carry both tags
    (``get_tag`` raises ``KeyError`` for a missing one).
    """
    counts = defaultdict(Counter)
    with pysam.AlignmentFile(annotated_bam, "rb", threads=2) as bam:
        for read in bam:
            tx = read.get_tag("YT")
            if tx not in tx_set:
                continue
            is_good = read.get_tag("XC") in good_tags
            counts[tx][is_good] += 1

    return counts

def count_tx_umis(annotated_bam, tx_set):
    """Collect the distinct (sequence, UMI) pairs seen for each transcript.

    Reads every record of ``annotated_bam`` and keeps those whose ``YT`` tag
    is in ``tx_set``. Returns a mapping ``YT value -> set`` of
    ``(aligned query sequence, UB tag)`` tuples; the size of each set is the
    UMI count used downstream. Every record is expected to carry both tags
    (``get_tag`` raises ``KeyError`` for a missing one).
    """
    tx_reads = defaultdict(set)
    with pysam.AlignmentFile(annotated_bam, "rb", threads=2) as fh:
        for a in fh:
            if (tx := a.get_tag("YT")) in tx_set:
                # `AlignedSegment.query` is a deprecated alias of
                # `query_alignment_sequence`; use the supported name
                tx_reads[tx].add((a.query_alignment_sequence, a.get_tag("UB")))

    return tx_reads

In [None]:
# sample -> transcript -> Counter of reads keyed by good-priming True/False
shared_read_priming = defaultdict(lambda: defaultdict(Counter))
unique_read_priming = defaultdict(lambda: defaultdict(Counter))

with ProcessPoolExecutor(8) as exc:
    for i in [1, 3, 4]:
        s = MASSEQ_KEYS[i]
        anno_bams = sorted(annotated_path.glob(f"*.{i}.*annotated.bam"))
        print(" ".join(s), len(anno_bams))
        # same accumulation for both groups: shared transcripts first, then unique
        for totals, tx_sets in ((shared_read_priming, s_sets), (unique_read_priming, u_sets)):
            # one task per BAM; per-BAM counters are summed per transcript
            for priming_c in exc.map(count_tx_priming, anno_bams, itertools.repeat(tx_sets[s])):
                for tx, tx_counts in priming_c.items():
                    totals[s][tx] += tx_counts

# convert to plain dicts: the lambda default factories cannot be pickled
shared_read_priming = {sample: dict(per_tx) for sample, per_tx in shared_read_priming.items()}
unique_read_priming = {sample: dict(per_tx) for sample, per_tx in unique_read_priming.items()}

In [None]:
# sample -> array with one UMI count per transcript that had matching reads
shared_counts = {}
unique_counts = {}

with ProcessPoolExecutor(8) as exc:
    for i in [1, 3, 4]:
        s = MASSEQ_KEYS[i]
        anno_bams = sorted(annotated_path.glob(f"*.{i}.*annotated.bam"))
        print(" ".join(s), len(anno_bams))

        # merge the per-BAM (sequence, UMI) sets so duplicates across BAMs
        # are only counted once
        shared_reads = defaultdict(set)
        for rld in exc.map(count_tx_umis, anno_bams, itertools.repeat(s_sets[s])):
            for tx, pairs in rld.items():
                shared_reads[tx] |= pairs

        shared_counts[s] = np.array([len(pairs) for pairs in shared_reads.values()])

        # same procedure for the transcripts unique to this sample
        unique_reads = defaultdict(set)
        for rld in exc.map(count_tx_umis, anno_bams, itertools.repeat(u_sets[s])):
            for tx, pairs in rld.items():
                unique_reads[tx] |= pairs

        unique_counts[s] = np.array([len(pairs) for pairs in unique_reads.values()])

# delete these to save memory
del shared_reads, unique_reads

In [None]:
# Cache the computed statistics so that later sessions can skip the passes over
# the BAMs. An existing stats file is never overwritten.
if not stats_file.exists():
    with stats_file.open("wb") as out:
        # the tuple order must match the unpacking used where the pickle is loaded
        pickle.dump((shared_read_priming, unique_read_priming, shared_counts, unique_counts), out)

### Results

We print out some statistics and then make the plot for Figure 2g.

In [None]:
# 5th / 50th / 95th percentile of UMIs per transcript: shared first, then unique
pctiles = (5, 50, 95)
for s in samples:
    shared_pct = np.percentile(shared_counts[s], pctiles)
    unique_pct = np.percentile(unique_counts[s], pctiles)
    print(s[0], shared_pct, unique_pct)

In [None]:
# Per sample: total reads, fraction of reads with a good priming tag, and the
# number of transcripts -- first for the shared set, then for the unique set.
for s in samples:
    total_shared = sum(shared_read_priming[s].values(), start=Counter())
    total_unique = sum(unique_read_priming[s].values(), start=Counter())
    print(s[0])
    for totals, tx_sets in ((total_shared, s_sets), (total_unique, u_sets)):
        n_reads = totals.total()
        # the True key holds the reads whose priming tag was in the good set
        good_frac = totals[True] / n_reads
        print(f"\t{n_reads:,}\t{good_frac:.1%}\t\t{len(tx_sets[s]):,}")
    print()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 4))

# unique-transcript distributions first, then shared, each in sample order
count_arrays = [unique_counts[s] for s in samples] + [shared_counts[s] for s in samples]
sample_names = [s[0] for s in samples] * 2
plot_dists(
    ax,
    count_arrays,
    log=True,
    colors=[SAMPLE_COLORS[name] for name in sample_names],
    labels=sample_names,
)

# major ticks at the integer positions 0..6, labelled as powers of ten
# (presumably the data are drawn on a log10 axis because of log=True)
major_ticks = np.arange(7)
major_labels = [f"$10^{p}$" for p in range(7)]
ax.set_yticks(major_ticks, major_labels, minor=False)

# minor ticks at 2..9 times each power of ten below 10^6, plus 2e6 and 3e6
minor_values = [m * 10**p for p in range(6) for m in range(2, 10)] + [2e6, 3e6]
ax.set_yticks(np.log10(minor_values), minor=True)
ax.set_ylabel("UMIs per transcript")

plt.savefig(figure_path / "fig2g_gffcompare_counts.svg")
