# Issue 1: Reverse complementing a single byte string
Date noted: 2023-05-29
Date solved

In [32]:
# Reproduction for Issue 1: build one random DNA sequence, convert it to bytes,
# one-hot encode it, then ask for the reverse complement of the encoded array.
import seqpro as sp
seq = sp.random_seq(15, alphabet=["A", "G", "C", "T"])
dna = sp.ALPHABETS["DNA"]
seq_bytes = sp.cast_seqs(seq)
ohe_seq = dna.bytes_to_ohe(seq_bytes)
# The cell ends with "ValueError: Need a length axis to process an ndarray."
# (recorded below). Presumably raised by this call, which passes only
# ohe_axis and no length axis -- TODO confirm against the full traceback.
rc_ohe_seq = dna.reverse_complement(ohe_seq, ohe_axis=1)

ValueError: Need a length axis to process an ndarray.

# Issue 2: Calculating GC content with normalization
Date noted: 2023-05-29
Date solved

In [61]:
# Reproduction for Issue 2: gc_content on string and one-hot inputs,
# first with normalize=False (works) and then with normalize=True (fails).
import seqpro as sp
seqs = sp.random_seqs(10, 15, alphabet=["A", "G", "C", "T"])
ohe_seqs = sp.ohe(seqs, alphabet=sp.ALPHABETS["DNA"])
print(seqs.shape, ohe_seqs.shape)
print(sp.gc_content(seqs, normalize=False))  # works correctly
print(sp.gc_content(ohe_seqs, normalize=False, length_axis=1, alphabet=sp.ALPHABETS["DNA"], ohe_axis=2))  # works correctly
# normalize=True raises UFuncTypeError: the 'divide' output (float64) cannot be
# cast to int64 (recorded below). Execution stops at the first failing call.
gc = sp.gc_content(seqs, normalize=True)
gc = sp.gc_content(ohe_seqs, normalize=True, length_axis=1, alphabet=sp.ALPHABETS["DNA"], ohe_axis=2)  # same error (not reached in this cell because the call above raises first)

(10,) (10, 15, 4)
[ 7  9  3 10  7  7  7  9  6 10]
[ 7  9  3 10  7  7  7  9  6 10]


UFuncTypeError: Cannot cast ufunc 'divide' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'

# Issue 3: Calculating nucleotide content with normalization
Date noted: 2023-05-29
Date solved

In [71]:
# Reproduction for Issue 3: nucleotide_content on one-hot input,
# without and then with normalization.
import seqpro as sp
seqs = sp.random_seqs(10, 15, alphabet=["A", "G", "C", "T"])
ohe_seqs = sp.ohe(seqs, alphabet=sp.ALPHABETS["DNA"])
print(seqs.shape, ohe_seqs.shape)
print(sp.nucleotide_content(ohe_seqs, normalize=False, length_axis=1))  # works correctly
print(sp.nucleotide_content(ohe_seqs, normalize=True, length_axis=1))  # raises UFuncTypeError ('divide' float64 -> uint64 cast), same kind of failure as Issue 2

(10,) (10, 15, 4)
[[3 4 5 3]
 [6 4 3 2]
 [3 4 2 6]
 [3 3 4 5]
 [3 8 1 3]
 [1 5 5 4]
 [2 5 3 5]
 [3 3 3 6]
 [7 3 3 2]
 [7 2 2 4]]


UFuncTypeError: Cannot cast ufunc 'divide' output from dtype('float64') to dtype('uint64') with casting rule 'same_kind'

# Issue 4: Calculating nucleotide content without normalization gives a broadcasting error
Date noted: 2023-05-29
Date solved

In [72]:
# Reproduction for Issue 4: nucleotide_content on string (non one-hot) input
# with normalize=False.
import seqpro as sp
seqs = sp.random_seqs(10, 15, alphabet=["A", "G", "C", "T"])
ohe_seqs = sp.ohe(seqs, alphabet=sp.ALPHABETS["DNA"])
print(seqs.shape, ohe_seqs.shape)
print(sp.nucleotide_content(seqs, normalize=False, alphabet=sp.ALPHABETS["DNA"]))  # raises ValueError: could not broadcast (10,) into (10,10,15) -- see output below

(10,) (10, 15, 4)


ValueError: could not broadcast input array from shape (10,) into shape (10,10,15)

# Issue 5: count_kmers_seq TypeError
I don't think the logic of this function is correct. See the step-by-step walkthrough with print statements below.
Date noted: 2023-05-29
Date solved

In [74]:
# Reproduction for Issue 5: call the private k-mer counter on one random sequence.
import seqpro as sp
seq = sp.random_seq(15, alphabet=["A", "G", "C", "T"])
# Raises "TypeError: sequence item 0: expected a bytes-like object, int found"
# (recorded below).
kmers = sp._analyzers.count_kmers_seq(seq, k=3)

TypeError: sequence item 0: expected a bytes-like object, int found

In [112]:
# Step-by-step walkthrough of the k-mer counting logic on a 100-character
# sequence, to show where the counting goes wrong.
import numpy as np
import seqpro as sp
seq = sp.random_seq(100, alphabet=["A", "G", "C", "T"])
k = 3
assert len(seq) >= k, "Length of seq must be greater than that of k."
# Wrap the sequence as one fixed-width bytes item, then view it as single bytes.
_seq = np.array([seq], "S").view("S1")
kmers = np.lib.stride_tricks.sliding_window_view(_seq, k)
print(kmers)  # so far so good: one row per window, each row holding the k single-byte characters of a k-mer
kmers = np.unique(kmers, return_counts=True)  # without an axis argument np.unique flattens the windows, so this counts unique single bytes, not k-mers (the counts below sum to 294 = 98 windows * 3)
kmers

[[b'A' b'A' b'G']
 [b'A' b'G' b'G']
 [b'G' b'G' b'A']
 [b'G' b'A' b'C']
 [b'A' b'C' b'C']
 [b'C' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'T']
 [b'G' b'T' b'G']
 [b'T' b'G' b'C']
 [b'G' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'C']
 [b'G' b'C' b'T']
 [b'C' b'T' b'C']
 [b'T' b'C' b'C']
 [b'C' b'C' b'T']
 [b'C' b'T' b'A']
 [b'T' b'A' b'T']
 [b'A' b'T' b'C']
 [b'T' b'C' b'G']
 [b'C' b'G' b'A']
 [b'G' b'A' b'A']
 [b'A' b'A' b'C']
 [b'A' b'C' b'A']
 [b'C' b'A' b'A']
 [b'A' b'A' b'T']
 [b'A' b'T' b'C']
 [b'T' b'C' b'C']
 [b'C' b'C' b'A']
 [b'C' b'A' b'G']
 [b'A' b'G' b'G']
 [b'G' b'G' b'C']
 [b'G' b'C' b'T']
 [b'C' b'T' b'G']
 [b'T' b'G' b'A']
 [b'G' b'A' b'C']
 [b'A' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'T']
 [b'G' b'T' b'C']
 [b'T' b'C' b'T']
 [b'C' b'T' b'A']
 [b'T' b'A' b'C']
 [b'A' b'C' b'A']
 [b'C' b'A' b'C']
 [b'A' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'G']
 [b'G' b'G' b'A']
 [b'G' b'A' b'A']
 [b'A' b'A' b'C']
 [b'A' b'C' b'G']
 [b'C' b'G' b'A']
 [b'G' b'A' b'A']
 [b'A' b'A

(array([b'A', b'C', b'G', b'T'], dtype='|S1'), array([90, 79, 78, 47]))

In [113]:
# Proposed fix for the k-mer count: turn every k-byte window into ONE item
# before calling np.unique, so that whole k-mers are counted instead of
# single bytes. Relies on `seq`, `k`, and `np` from the previous cell.
_seq = np.array([seq], "S").view("S1")
kmers = np.lib.stride_tricks.sliding_window_view(_seq, k)
print(kmers)  # one row per window, each row holding the k single-byte characters of a k-mer
# The sliding-window view is overlapping strided memory, so copy it into a
# contiguous (n_windows, k) block first; each row can then be reinterpreted as
# a single k-byte string without a per-row Python join/decode loop.
# astype(str) performs the same ASCII decoding as bytes.decode("ascii").
kmers = np.ascontiguousarray(kmers).view(f"S{k}").ravel().astype(str)
print(kmers)
kmers, counts = np.unique(kmers, return_counts=True)
kmers, counts

[[b'A' b'A' b'G']
 [b'A' b'G' b'G']
 [b'G' b'G' b'A']
 [b'G' b'A' b'C']
 [b'A' b'C' b'C']
 [b'C' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'T']
 [b'G' b'T' b'G']
 [b'T' b'G' b'C']
 [b'G' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'C']
 [b'G' b'C' b'T']
 [b'C' b'T' b'C']
 [b'T' b'C' b'C']
 [b'C' b'C' b'T']
 [b'C' b'T' b'A']
 [b'T' b'A' b'T']
 [b'A' b'T' b'C']
 [b'T' b'C' b'G']
 [b'C' b'G' b'A']
 [b'G' b'A' b'A']
 [b'A' b'A' b'C']
 [b'A' b'C' b'A']
 [b'C' b'A' b'A']
 [b'A' b'A' b'T']
 [b'A' b'T' b'C']
 [b'T' b'C' b'C']
 [b'C' b'C' b'A']
 [b'C' b'A' b'G']
 [b'A' b'G' b'G']
 [b'G' b'G' b'C']
 [b'G' b'C' b'T']
 [b'C' b'T' b'G']
 [b'T' b'G' b'A']
 [b'G' b'A' b'C']
 [b'A' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'T']
 [b'G' b'T' b'C']
 [b'T' b'C' b'T']
 [b'C' b'T' b'A']
 [b'T' b'A' b'C']
 [b'A' b'C' b'A']
 [b'C' b'A' b'C']
 [b'A' b'C' b'G']
 [b'C' b'G' b'G']
 [b'G' b'G' b'G']
 [b'G' b'G' b'A']
 [b'G' b'A' b'A']
 [b'A' b'A' b'C']
 [b'A' b'C' b'G']
 [b'C' b'G' b'A']
 [b'G' b'A' b'A']
 [b'A' b'A

(array(['AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA',
        'AGG', 'ATA', 'ATC', 'ATG', 'CAA', 'CAC', 'CAG', 'CAT', 'CCA',
        'CCG', 'CCT', 'CGA', 'CGC', 'CGG', 'CTA', 'CTC', 'CTG', 'GAA',
        'GAC', 'GAG', 'GAT', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG', 'GGT',
        'GTC', 'GTG', 'TAC', 'TAG', 'TAT', 'TCC', 'TCG', 'TCT', 'TGA',
        'TGC'], dtype='<U3'),
 array([1, 4, 1, 2, 5, 1, 3, 1, 2, 3, 3, 4, 1, 3, 1, 1, 2, 2, 1, 1, 4, 1,
        4, 2, 3, 1, 3, 3, 2, 2, 3, 2, 3, 2, 2, 2, 1, 1, 2, 1, 2, 3, 2, 2,
        1, 2]))

# Issue 6: `k_shuffle` with ohe sequences that are padded
I think this is because the function looks along the sequence length for non-zero positions, which won't exist in padded sequences at the positions where padding has occurred.
Date noted: 2023-05-29
Date solved

In [116]:
# Reproduction for Issue 6: k_shuffle on a one-hot batch built from sequences
# of two different lengths (15 and 20).
import seqpro as sp
import numpy as np
short_seqs = sp.random_seqs(10, 15, alphabet=["A", "G", "C", "T"])
longer_seqs = sp.random_seqs(10, 20, alphabet=["A", "G", "C", "T"])
seqs = np.concatenate([short_seqs, longer_seqs], axis=0)
ohe_seqs = sp.ohe(seqs, alphabet=sp.ALPHABETS["DNA"])
print(seqs.shape, ohe_seqs.shape)
# Raises "ValueError: cannot reshape array of size 350 into shape (20,20)"
# (recorded below); note 350 = 10*15 + 10*20, the number of unpadded positions.
sp.k_shuffle(ohe_seqs, k=3, length_axis=1, alphabet=sp.ALPHABETS["DNA"])

(20,) (20, 20, 4)


ValueError: cannot reshape array of size 350 into shape (20,20)

# Issue 7: `k_shuffle` with `length_axis` = 0 for single one-hot encoded sequence
I ran into an error when working with a single sequence of shape (L, A) and passing `False` for `ohe_axis`, which is the default behavior for the `k_shuffle` function defined below. If I pass `length_axis=0`, the function raises an error saying that the length and OHE axes must be different, but only because `0 == False` evaluates to `True`.

In [7]:
# Reproduction for Issue 7: k_shuffle on a single one-hot sequence of shape
# (15, 4) with length_axis=0 and no explicit ohe_axis.
import seqpro as sp
import numpy as np
seq = sp.random_seq(15, alphabet=["A", "G", "C", "T"])
ohe_seq = sp.ohe(seq, alphabet=sp.ALPHABETS["DNA"])
print(seq, ohe_seq.shape)
# Raises "ValueError: Length and OHE axis must be different." (recorded below).
sp.k_shuffle(ohe_seq, k=3, length_axis=0, alphabet=sp.ALPHABETS["DNA"])

CACAGTTGGCTACCC (15, 4)


ValueError: Length and OHE axis must be different.

---