In [69]:
# Turn on IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Import seqpro through the autoreload machinery. Note that this is the only
# visible statement that can bind the bare name `seqpro` used by later cells;
# the regular import cell only binds the alias `sp`.
%aimport seqpro

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import seqpro as sp
import numpy as np

# `alphabets` module

In [90]:
# Currently implemented alphabets: a mapping from alphabet name to alphabet
# object (the recorded output lists the keys "DNA", "RNA" and "AA").
sp.ALPHABETS

{'DNA': <seqpro._alphabets.NucleotideAlphabet at 0x15552a9f4c70>,
 'RNA': <seqpro._alphabets.NucleotideAlphabet at 0x155517661be0>,
 'AA': <seqpro._alphabets.AminoAlphabet at 0x155517414e50>}

In [92]:
# Pull every registered alphabet out of the registry in a single unpacking
dna, rna, aa = (sp.ALPHABETS[name] for name in ("DNA", "RNA", "AA"))

In [95]:
# For each nucleotide alphabet, print its size followed by its properties,
# one per line and in a fixed order.
for alphabet in (dna, rna):
    print(len(alphabet))
    for prop in (
        "alphabet",
        "complement",
        "array",
        "complement_map",
        "complement_map_bytes",
        "str_comp_table",
        "bytes_comp_table",
    ):
        print(getattr(alphabet, prop))

4
ACGT
TGCA
[b'A' b'C' b'G' b'T']
{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
{b'A': b'T', b'C': b'G', b'G': b'C', b'T': b'A'}
{65: 'T', 67: 'G', 71: 'C', 84: 'A'}
b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff'
4
ACGU
UGCA
[b'A' b'C' b'G' b'U']
{'A': 'U', 'C': 'G', 'G': 'C', 'U': 'A'}
{b'A': b'U', b'C': b'G', 

In [105]:
# Walk one random DNA sequence through every representation the alphabet
# offers, printing each form right after it is produced.
seq = sp.random_seq(15, alphabet=["A", "G", "C", "T"])
print(f"Sequence: {seq}")

# Plain string -> byte array / byte string
seq_bytes = sp.cast_seqs(seq)
print(f"Sequence as byte array: {seq_bytes}")
seq_bstring = seq.encode()
print(f"Sequence as byte string: {seq_bstring}")

# One-hot encode, then decode back again
ohe_seq = dna.bytes_to_ohe(seq_bytes)
print(f"Sequence as one-hot encoded array: {ohe_seq}")
decoded_seq = dna.ohe_to_bytes(ohe_seq)
print(f"Sequence as decoded byte array: {decoded_seq}")
decoded_seq_bstring = decoded_seq.tobytes()
print(f"Sequence as decoded byte string: {decoded_seq_bstring}")

# Complement and reverse complement for each supported input type
complement_seq = dna.complement_bytes(seq_bytes)
print(f"Sequence as complement byte array: {complement_seq}")
rc_comp_seq = dna.rev_comp_byte(seq_bytes, length_axis=-1)
print(f"Sequence as reverse complement byte array: {rc_comp_seq}")
rc_comp_seq_string = dna.rev_comp_string(seq)
print(f"Sequence as reverse complement string: {rc_comp_seq_string}")
rc_comp_seq_bstring = dna.rev_comp_bstring(seq_bstring)
print(f"Sequence as reverse complement byte string: {rc_comp_seq_bstring}")

Sequence: TTCAGAGGGTTCGTA
Sequence as byte array: [b'T' b'T' b'C' b'A' b'G' b'A' b'G' b'G' b'G' b'T' b'T' b'C' b'G' b'T'
 b'A']
Sequence as byte string: b'TTCAGAGGGTTCGTA'
Sequence as one-hot encoded array: [[0 0 0 1]
 [0 0 0 1]
 [0 1 0 0]
 [1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]]
Sequence as decoded byte array: [b'T' b'T' b'C' b'A' b'G' b'A' b'G' b'G' b'G' b'T' b'T' b'C' b'G' b'T'
 b'A']
Sequence as decoded byte string: b'TTCAGAGGGTTCGTA'
Sequence as complement byte array: [b'A' b'A' b'G' b'T' b'C' b'T' b'C' b'C' b'C' b'A' b'A' b'G' b'C' b'A'
 b'T']
Sequence as reverse complement byte array: [b'T' b'A' b'C' b'G' b'A' b'A' b'C' b'C' b'C' b'T' b'C' b'T' b'G' b'A'
 b'A']
Sequence as reverse complement string: TACGAACCCTCTGAA
Sequence as reverse complement byte string: b'TACGAACCCTCTGAA'


In [108]:
# Test the generic reverse complement entry point on a string and on a
# one-hot encoded array.
rc_seq = dna.reverse_complement(seq)
rc_ohe_seq = dna.reverse_complement(ohe_seq, length_axis=0, ohe_axis=1)
rc_rc_ohe_seq = dna.reverse_complement(rc_ohe_seq, length_axis=0, ohe_axis=1)

# Reverse complementing twice must give back the original one-hot array.
# Assert it (instead of only displaying a boolean) so that Run All fails
# loudly if the round trip ever breaks.
assert np.array_equal(rc_rc_ohe_seq, ohe_seq)

# Show the reverse complement of the plain string sequence
rc_seq

array([b'T', b'A', b'C', b'G', b'A', b'A', b'C', b'C', b'C', b'T', b'C',
       b'T', b'G', b'A', b'A'], dtype='|S1')

# `utils` module

In [29]:
# Ten random 15-mers to use as the main test batch.
# NOTE(review): "g" is lowercase in this alphabet — presumably deliberate so
# the sanitize step has something to normalize; confirm.
test_seqs = sp.random_seqs(10, 15, alphabet=["A", "g", "C", "T"])
test_seq = test_seqs[0]

# A batch made up exclusively of Ns
test_seqs_with_only_N = sp.random_seqs(10, 15, alphabet=["N"])

# A batch in which N is just one of the possible letters
test_seqs_with_N = sp.random_seqs(10, 15, alphabet=["A", "G", "C", "T", "N"])

# Stack the three batches: plain first, then N-containing, then all-N
all_test_seqs = np.concatenate([test_seqs, test_seqs_with_N, test_seqs_with_only_N])

# Size of every batch, combined batch last
tuple(
    len(batch)
    for batch in (test_seqs, test_seqs_with_N, test_seqs_with_only_N, all_test_seqs)
)

(10, 10, 10, 30)

In [32]:
# Cast the single sequence and every batch, in the same order as before
(
    casted_test_seq,
    casted_test_seqs,
    casted_test_seqs_with_N,
    casted_test_seqs_with_only_N,
    casted_all_test_seqs,
) = map(
    sp.cast_seqs,
    (test_seq, test_seqs, test_seqs_with_N, test_seqs_with_only_N, all_test_seqs),
)

# Shape of each casted result
tuple(
    casted.shape
    for casted in (
        casted_test_seq,
        casted_test_seqs,
        casted_test_seqs_with_N,
        casted_test_seqs_with_only_N,
        casted_all_test_seqs,
    )
)

((15,), (10, 15), (10, 15), (10, 15), (30, 15))

# `cleaners` module

In [20]:
# Remove sequences that consist only of Ns
all_test_seqs_no_N = sp.remove_only_N_seqs(all_test_seqs)

# Exactly the all-N batch should be gone. The expected count is derived from
# the inputs rather than hard-coded, so the check keeps working if the batch
# sizes above are changed.
assert len(all_test_seqs_no_N) == len(all_test_seqs) - len(test_seqs_with_only_N)

In [24]:
# Remove sequences that contain any N
all_test_seqs_removed_N = sp.remove_N_seqs(all_test_seqs)

# Lower bound: `test_seqs` is drawn from an alphabet without N, so all of it
# must survive (this catches a filter that drops too much).
# Upper bound: same as the original `< 20` check, written in terms of the
# batch sizes — at least one sequence of the N-containing batch is expected
# to go, and the whole all-N batch must go.
assert (
    len(test_seqs)
    <= len(all_test_seqs_removed_N)
    < len(test_seqs) + len(test_seqs_with_N)
)

In [28]:
# Run the sanitizer on one sequence and on the N-free batch
test_seq_sanitized = sp.sanitize_seq(test_seq)
all_test_seqs_removed_N_sanitized = sp.sanitize_seqs(all_test_seqs_removed_N)

# After sanitizing, the letters that occur must be a subset of A, G, C, T
assert set(test_seq_sanitized) <= set(["A", "G", "C", "T"])
assert all(
    set(seq) <= set(["A", "G", "C", "T"])
    for seq in all_test_seqs_removed_N_sanitized
)

# `encoders` module

In [6]:
# ASCII-encode one sequence and the whole batch.
# The module is accessed through the `sp` alias, matching the cells above.
ascii_seq = sp.ascii_encode_seq(test_seq)
ascii_seqs = sp.ascii_encode_seqs(test_seqs)

# Decode both back to sequences
decoded_seq = sp.ascii_decode_seq(ascii_seq)
decoded_seqs = sp.ascii_decode_seqs(ascii_seqs)

# The round trip must reproduce the input, for the single sequence as well as
# for the batch (the single-sequence result was previously never checked).
assert np.all(test_seq == decoded_seq)
assert np.all(test_seqs == decoded_seqs)

In [7]:
# One-hot encode one sequence and the whole batch.
# The module is accessed through the `sp` alias, matching the cells above.
ohe_seq = sp.ohe_seq(test_seq)
ohe_seqs = sp.ohe_seqs(test_seqs)

# Decode both back to sequences
decoded_seq = sp.decode_seq(ohe_seq)
decoded_seqs = sp.decode_seqs(ohe_seqs)

# The round trip must reproduce the input, for the single sequence as well as
# for the batch (the single-sequence result was previously never checked).
assert np.all(test_seq == decoded_seq)
assert np.all(test_seqs == decoded_seqs)

One-hot encoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

# `modifiers` module

In [13]:
# Reverse complement one sequence and the whole batch.
# The module is accessed through the `sp` alias, matching the cells above.
revcomp_seq = sp.reverse_complement_seq(test_seq)
revcomp_seqs = sp.reverse_complement_seqs(test_seqs)

# Reverse complement a second time to get back to the originals
revcomp_revcomp_seq = sp.reverse_complement_seq(revcomp_seq)
revcomp_revcomp_seqs = sp.reverse_complement_seqs(revcomp_seqs)

# Applying the operation twice must return the input, for the single sequence
# as well as for the batch (the single-sequence result was previously never
# checked).
assert np.all(test_seq == revcomp_revcomp_seq)
assert np.all(test_seqs == revcomp_revcomp_seqs)

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
# Shuffle one sequence and the whole batch.
# The module is accessed through the `sp` alias, matching the cells above.
shuffled_seq = sp.shuffle_seq(test_seq)
shuffled_seqs = sp.shuffle_seqs(test_seqs)

# At least one sequence in the batch must differ from its original.
# This is a probabilistic check: a shuffle can in principle return the input
# unchanged, but for a whole batch that is vanishingly unlikely.
assert not np.all(test_seqs == shuffled_seqs)

In [17]:
# Dinucleotide-shuffle one sequence and the whole batch.
# The module is accessed through the `sp` alias, matching the cells above.
dinuc_shuffled_seq = sp.dinuc_shuffle_seq(test_seq)
dinuc_shuffled_seqs = sp.dinuc_shuffle_seqs(test_seqs)

# At least one sequence in the batch must differ from its original.
# This is a probabilistic check, not a guarantee of the shuffle itself.
assert not np.all(test_seqs == dinuc_shuffled_seqs)

# `analyzers` module

In [22]:
# Get the length of every sequence in the batch.
# The module is accessed through the `sp` alias, matching the cells above.
lens = sp.len_seqs(test_seqs)

# Compare against Python's own len() for each sequence. The previous check
# expected a length of 1000, but `test_seqs` is generated as 15-mers, so it
# could not pass on a fresh run.
assert np.array_equal(lens, [len(seq) for seq in test_seqs])

In [24]:
# Get the GC content of the first sequence.
# The module is accessed through the `sp` alias, matching the cells above.
gc_content = sp.gc_content_seq(test_seqs[0])

# Check that the GC content of the sequence is between 0 and 1
assert np.all((gc_content >= 0) & (gc_content <= 1))

# Get the GC content of every sequence in the batch
gc_contents = sp.gc_content_seqs(test_seqs)

# Check that the GC content of the sequences is between 0 and 1
assert np.all((gc_contents >= 0) & (gc_contents <= 1))

# The batch result for the first sequence must agree with the single-sequence
# result. This replaces the former "within 0.1 of 0.5" checks: for random
# 15-nt sequences the GC fraction routinely falls outside that window, so the
# fixed tolerance made the cell fail for reasons unrelated to the code under test.
assert np.isclose(np.asarray(gc_contents)[0], gc_content)

In [31]:
# Get the nucleotide content of the first sequence.
# The module is accessed through the `sp` alias, matching the cells above.
nuc_content = sp.nucleotide_content_seq(test_seqs[0])

# Check that the nucleotide content of the sequence is between 0 and 1
assert np.all((nuc_content >= 0) & (nuc_content <= 1))

# Get the nucleotide content of every sequence in the batch
nuc_contents = sp.nucleotide_content_seqs(test_seqs)

# Check that the nucleotide content of the sequences is between 0 and 1
assert np.all((nuc_contents >= 0) & (nuc_contents <= 1))

# The batch result for the first sequence must agree with the single-sequence
# result. This replaces the former "within 0.1 of 0.25" checks, which are not
# meaningful for random 15-nt sequences (a single extra or missing letter
# already moves a fraction by about 0.07).
# NOTE(review): assumes the batch result is indexed by sequence along its
# first axis — confirm against the seqpro API.
assert np.allclose(np.asarray(nuc_contents)[0], nuc_content)

In [36]:
# Count the 10-mers of the first sequence.
# The module is accessed through the `sp` alias, matching the cells above.
kmer_counts = sp.count_kmers_seq(test_seqs[0], 10)

# `experimental` module