### Generate R10 squiggle for barcodes

In [45]:
import collections
import csv
from itertools import islice
from pathlib import Path

import numpy as np
import numpy.typing as npt
from pyfastx import Fasta

In [46]:
# Load the k-mer model table: each tab-separated row contributes one entry,
# keyed by its first column with the second column parsed as a float.
kmer_values = {}
with open("barcoding/model_test.tsv", "r") as fh:
    reader = csv.reader(fh, delimiter="\t")
    for row in reader:
        kmer_values[row[0]] = float(row[1])

In [47]:
def signalify(
    kmers: dict[str, float],
    sequence: str,
    *,
    kmer_len: int = 9,
    samples_per_kmer: int = 10,
) -> npt.NDArray[np.int16]:
    """Convert a sequence to a synthetic signal using an R10 k-mer model.

    The sequence is upper-cased and every overlapping window of ``kmer_len``
    characters is looked up in ``kmers``. Each looked-up value is scaled by
    ``2048 / 200``, truncated to ``int16`` and repeated ``samples_per_kmer``
    times.

    Args:
        kmers: Mapping from upper-case k-mer string to its model value.
        sequence: Sequence to convert; case is ignored.
        kmer_len: Window length used for the lookups (must match the keys
            of ``kmers``).
        samples_per_kmer: How many identical samples to emit per window.

    Returns:
        A 1-D ``int16`` array holding ``samples_per_kmer`` samples for each
        window. It is empty when the sequence is shorter than ``kmer_len``.

    Raises:
        KeyError: If a window of the sequence is not a key of ``kmers``.
        ValueError: If ``kmer_len`` is smaller than 1.
    """
    if kmer_len < 1:
        raise ValueError("kmer_len must be at least 1")

    # Always upper case signal
    bases = sequence.upper()
    n_windows = max(len(bases) - kmer_len + 1, 0)
    levels = np.fromiter(
        (kmers[bases[start : start + kmer_len]] for start in range(n_windows)),
        dtype=np.float64,
        count=n_windows,
    )
    # NOTE(review): 2048 / 200 is presumably the digitisation / range scaling
    # of the target device, with a zero offset -- confirm against the model.
    scaled = (levels * 2048) / 200
    # Cast before repeating so the conversion runs once per window.
    return np.repeat(scaled.astype(np.int16), samples_per_kmer)

In [48]:
# Directory expected to hold the barcode FASTA files, anchored at the
# resolved current working directory.
barcode = Path(".").resolve().joinpath("barcoding", "fasta")
# Stop immediately if that directory is not present.
assert barcode.exists()

In [49]:
def sliding_window(iterable, n):
    """Yield successive overlapping tuples of length ``n`` from ``iterable``.

    Example: sliding_window('ABCDEFG', 4) --> ABCD BCDE CDEF DEFG
    Nothing is yielded when the input holds fewer than ``n`` items.
    """
    stream = iter(iterable)
    # A bounded deque drops its oldest item whenever a new one is appended.
    frame = collections.deque(islice(stream, n), maxlen=n)
    if len(frame) != n:
        # Input was too short to fill even one window.
        return
    yield tuple(frame)
    for item in stream:
        frame.append(item)
        yield tuple(frame)

In [52]:
# Generate one squiggle file per record in every barcode FASTA file.
squiggle_dir = Path("barcoding/R10_signal")
# np.save does not create missing parent directories, so make sure the
# output directory exists before the first write.
squiggle_dir.mkdir(parents=True, exist_ok=True)
for fasta_file in barcode.glob("*.fasta"):
    fa = Fasta(str(fasta_file))
    for seq in fa:
        # Echo the record name so progress is visible in the cell output.
        print(seq.name)
        raw_signal = signalify(kmer_values, seq.seq)
        squiggle_path = squiggle_dir / f"{seq.name}_R10.squiggle.npy"
        np.save(squiggle_path, arr=raw_signal)

Barcode06_1
Barcode06_2
Barcode11_1
Barcode11_2
Barcode03_1
Barcode03_2
Barcode01_1
Barcode01_2
Barcode12_1
Barcode12_2
Barcode05_1
Barcode05_2
Barcode04_1
Barcode04_2
Barcode10_1
Barcode10_2
Barcode08_1
Barcode08_2
Barcode09_1
Barcode09_2
Barcode02_1
Barcode02_2
Barcode07_1
Barcode07_2
