# Barcode extraction

Once the reads have been deconcatenated (if they're MAS-seq) and classified with `marti`, we need to extract the barcode and UMI. We do this using `bouncer`, a Python+Rust package that uses the `barcode-symspell` crate to match barcodes by edit distance. Matching is fast, but there are a lot of reads, so it still takes a while.

As we extract the barcode and UMI we will also extract and re-orient the cDNA to be 5' to 3', based on the adapters.

## imports

In [None]:
import pickle
from collections import defaultdict
from pathlib import Path

import numpy as np

import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor

import yaml
import pysam

from mdl.sc_isoform_paper import today
from mdl.sc_isoform_paper.constants import MASSEQ_FILENAMES, MASSEQ_KEYS, SAMPLE_COLORS

## setup

In [None]:
# Turn htslib's logging down to its quietest level.
pysam.set_verbosity(0)

# Everything this notebook reads or writes lives under the home directory.
root_dir = Path.home()
sh_dir = root_dir / "sh_scripts"

data_path = root_dir / "data" / "masseq"

# Directory holding the classified BAMs (the input to barcode extraction).
marti_path = data_path / "20240707_marti"
# Output directory for the tagged BAMs, named with the `today` stamp.
cdna_path = data_path / f"{today}_cdna"
cdna_path.mkdir(exist_ok=True)

# Create the figure directory up front, like cdna_path, so that saving the
# figure at the end of the notebook cannot fail on a missing directory after
# the long-running steps have already completed.
figure_path = root_dir / "202501_figures"
figure_path.mkdir(exist_ok=True)

### config files for barcode extraction

In [None]:
# Barcode list file for each sample type; all of them live in root_dir / "metadata".
barcode_files = {
    sample_type: root_dir / "metadata" / filename
    for sample_type, filename in (
        ("PIPseq", "fluent_barcodes.txt.gz"),
        ("10x 3'", "3M-february-2018.txt.gz"),
        ("10x 5'", "737K-august-2016.txt.gz"),
    )
}

# Per-sample-type values that are written into the extract_barcodes config
# files as `umi_size` and `buffer_size` respectively.
umi_sizes = {
    "PIPseq": 12,
    "10x 3'": 12,
    "10x 5'": 10,
}
buffer_sizes = {
    "PIPseq": 56,
    "10x 3'": 29,
    "10x 5'": 27,
}


In [None]:
# Every classified BAM one directory level below marti_path, in sorted order.
classified_bams = sorted(marti_path.glob("*/*.classified.bam"))

# Matching output path in cdna_path: drop the last two dot-separated fields
# of the file name and append ".tagged.bam" instead.
tagged_bams = [
    cdna_path / (bam.name.rsplit(".", 2)[0] + ".tagged.bam")
    for bam in classified_bams
]

# Show the input/output pairing, one pair per line.
print(
    *(f"{src.stem}    {dst.stem}" for src, dst in zip(classified_bams, tagged_bams)),
    sep="\n",
)

In [None]:
# Write one extract_barcodes config file per sample index.
for i in (1, 2, 3, 4):
    k = MASSEQ_FILENAMES[i]
    key = MASSEQ_KEYS[i][0]

    config = dict(
        sample_type=key,
        barcode_file=str(barcode_files[key]),
        umi_size=umi_sizes[key],
        buffer_size=buffer_sizes[key],
        # glob order depends on the filesystem; sort so the generated YAML is
        # reproducible and agrees with the sorted `classified_bams` listing
        bam_paths=[str(p) for p in sorted(marti_path.glob(f"*/*.{i}.classified.bam"))],
    )

    with open(data_path / f"barcode_config.{k}.yaml", "w") as out:
        yaml.dump(config, out)

Then we can run each of the samples using the `extract_barcodes` command from this package:

```bash
extract_barcodes -p 8 --config-file barcode_config.pipseq_8x.yaml
```

By default this will write tagged BAM files adjacent to the classified BAMs. Use `--output-dir` to write them to a separate directory; the cells below look for the tagged BAMs in `cdna_path`, so point `--output-dir` there.

## Read tagged BAM files and count barcodes/UMIs

As a quick check, we'll count up barcodes and UMIs for each sample.

In [None]:
def count_barcode_umis(key, bam_file):
    """Collect the set of UMIs seen for every cell barcode in one BAM file.

    Args:
        key: identifier that is handed back unchanged, so a caller mapping
            this function over many files can tell which result is which.
        bam_file: path of the BAM file to read. It is opened with
            ``check_sq=False``, so no reference sequences are required in
            the header.

    Returns:
        A ``(key, counter)`` tuple, where ``counter`` is a
        ``defaultdict(set)`` mapping each ``CB`` tag value to the set of
        ``UB`` tag values observed with it.

    Raises:
        KeyError: if a read lacks the ``CB`` or ``UB`` tag (raised by
            pysam's ``get_tag``).
    """
    umis_by_barcode = defaultdict(set)

    # threads=8 is passed to pysam for reading the compressed file
    with pysam.AlignmentFile(bam_file, "rb", check_sq=False, threads=8) as bam:
        for read in bam:
            barcode, umi = read.get_tag("CB"), read.get_tag("UB")
            umis_by_barcode[barcode].add(umi)

    return key, umis_by_barcode


In [None]:
# One barcode -> set-of-UMIs table per key of MASSEQ_KEYS; the results of all
# tagged BAMs that carry the same key are merged into the same table.
barcode_umi_counters = {sample: defaultdict(set) for sample in MASSEQ_KEYS}

# The key of each tagged BAM is the third dot-separated field of its file
# name, parsed as an integer.
tagged_keys = (int(bam.name.split(".")[2]) for bam in tagged_bams)

# Count each file in a worker process and fold the per-file results together
# here in the parent as they come back.
with ProcessPoolExecutor(16) as pool:
    for i, bcu in pool.map(count_barcode_umis, tagged_keys, tagged_bams):
        for barcode, u_set in bcu.items():
            barcode_umi_counters[i][barcode] |= u_set

In [None]:
# print some stats: number of barcodes, mean UMIs/barcode, # barcodes > 1000 UMIs, total UMIs
for i, bcu in barcode_umi_counters.items():
    # integer dtype so that an empty sample still reports integer totals
    umis_per_bc = np.array([len(v) for v in bcu.values()], dtype=int)
    # a sample with no barcodes has no defined mean; report 0 instead of NaN
    mean_umis = umis_per_bc.mean() if umis_per_bc.size else 0.0
    print(
        f"{' '.join(MASSEQ_KEYS[i]):12s}",
        len(bcu),
        f"{mean_umis:.1f}",
        (umis_per_bc > 1000).sum(),
        umis_per_bc.sum(),
        sep="\t",
    )

In [None]:
# Save the per-sample barcode -> UMI-set tables, with each inner defaultdict
# converted to a plain dict before pickling.
with (data_path / "longread_barcode_umi.pickle").open("wb") as out:
    pickle.dump(
        {sample: dict(counter) for sample, counter in barcode_umi_counters.items()},
        out,
    )

In [None]:
# Reload the saved tables so the cells below can run without recounting.
# NOTE: pickle must only be loaded from trusted files; this one is written by
# this notebook itself.
with (data_path / "longread_barcode_umi.pickle").open("rb") as fh:
    barcode_umi_counters = pickle.load(fh)

In [None]:
# kneeplots for long read data
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

for i in (1, 2, 3, 4):
    lbl = " ".join(MASSEQ_KEYS[i])
    c = SAMPLE_COLORS[MASSEQ_KEYS[i][0]]
    # colors are looked up by the first element of the sample key only;
    # sample 2 is drawn dotted, presumably to tell it apart from another
    # sample sharing its color -- TODO confirm
    linestyle = "dotted" if i == 2 else "solid"

    # number of distinct UMIs per barcode, largest first
    umis_per_bc = sorted((len(v) for v in barcode_umi_counters[i].values()), reverse=True)
    # ranks start at 1: x=0 has no position on a log-scaled axis, so plotting
    # against the implicit 0-based index would misplace the top barcode
    ranks = np.arange(1, len(umis_per_bc) + 1)

    ax.plot(ranks, umis_per_bc, label=lbl, color=c, linestyle=linestyle)

ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlabel("Barcode rank")
ax.set_ylabel("UMIs")
ax.set_title("UMIs per barcode")
ax.legend()

plt.savefig(figure_path / "supp_fig12_longread_kneeplots.svg")
plt.show()

These kneeplots are quite similar to the results from the short-read data, which is great. The PIPseq samples have slightly lower UMIs/cell but more barcodes are above 1000 UMIs, as before. The overall number of UMIs is lower, as we would expect from the lower depth of the long-read data.