In [69]:
%load_ext autoreload
%autoreload 2
%aimport seqpro

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
import numpy as np

# `utils` module

In [71]:
# Seed NumPy's global RNG so that a fresh top-to-bottom run produces the same
# sequences every time (the analyzer checks further down are statistical).
# NOTE(review): this only pins the output if seqpro.random_seqs draws from
# np.random's global state — confirm against the seqpro implementation.
np.random.seed(13)

# Generate 10 random sequences of length 1000.
# The alphabet contains a lowercase "g"; presumably deliberate, so that the
# sanitize step later in the notebook has something to normalise — TODO confirm.
test_seqs = seqpro.random_seqs(10, 1000, alphabet=["A", "g", "C", "T"])

# Keep the first sequence for the single-sequence function checks
test_seq = test_seqs[0]

In [72]:
# Batch drawn from an alphabet that contains nothing but "N"
test_seqs_with_only_N = seqpro.random_seqs(10, 1000, alphabet=["N"])

# Batch drawn from an alphabet that includes "N" alongside A, G, C, T
test_seqs_with_N = seqpro.random_seqs(10, 1000, alphabet=["A", "G", "C", "T", "N"])

# Append both batches to the 10 sequences that were generated without N.
# Slicing to the first 10 entries keeps this cell idempotent: without it,
# re-running the cell stacks another 20 sequences onto an already-extended
# array and the length assertions that follow the cleaner calls fail.
test_seqs = np.concatenate((test_seqs[:10], test_seqs_with_N, test_seqs_with_only_N))

# `cleaners` module

In [60]:
# Remove seqs that consist only of Ns
test_seqs = seqpro.remove_only_N_seqs(test_seqs)

# On a fresh run 30 sequences go in (10 without N, 10 drawn with N in the
# alphabet, 10 all-N); only the all-N batch should be gone. The message
# reports the actual count so a failure is diagnosable without re-running.
assert len(test_seqs) == 20, f"expected 20 sequences after removing all-N ones, got {len(test_seqs)}"

In [62]:
# Remove seqs that contain any N
test_seqs = seqpro.remove_N_seqs(test_seqs)

# Only the 10 sequences generated from the N-free alphabet are expected to
# survive. The message reports the actual count so a failure is diagnosable
# without re-running.
assert len(test_seqs) == 10, f"expected 10 sequences after removing N-containing ones, got {len(test_seqs)}"

In [68]:
# Run the sanitizer on the single sequence and on the whole batch
test_seq = seqpro.sanitize_seq(test_seq)
test_seqs = seqpro.sanitize_seqs(test_seqs)

# After sanitizing, no letter other than uppercase A, G, C or T may remain,
# neither in the single sequence ...
assert not any(base not in ("A", "G", "C", "T") for base in test_seq)
# ... nor anywhere in the batch
assert not any(base not in ("A", "G", "C", "T") for seq in test_seqs for base in seq)

# `encoders` module

In [6]:
# Ascii encode sequences (single sequence and batch)
ascii_seq = seqpro.ascii_encode_seq(test_seq)
ascii_seqs = seqpro.ascii_encode_seqs(test_seqs)

# Ascii decode sequences
decoded_seq = seqpro.ascii_decode_seq(ascii_seq)
decoded_seqs = seqpro.ascii_decode_seqs(ascii_seqs)

# Check that the decoded batch is the same as the original batch
assert np.all(test_seqs == decoded_seqs)

# Check the single-sequence round trip as well, so that the result of
# ascii_decode_seq is not left unverified.
# NOTE(review): assumes ascii_decode_seq returns a value that compares
# directly against test_seq with == — confirm against seqpro.
assert np.all(test_seq == decoded_seq)

In [7]:
# One-hot encode sequences (single sequence and batch)
ohe_seq = seqpro.ohe_seq(test_seq)
ohe_seqs = seqpro.ohe_seqs(test_seqs)

# Decode sequences
decoded_seq = seqpro.decode_seq(ohe_seq)
decoded_seqs = seqpro.decode_seqs(ohe_seqs)

# Check that the decoded batch is the same as the original batch
assert np.all(test_seqs == decoded_seqs)

# Check the single-sequence round trip as well, so that the result of
# decode_seq is not left unverified.
# NOTE(review): assumes decode_seq returns a value that compares directly
# against test_seq with == — confirm against seqpro.
assert np.all(test_seq == decoded_seq)

One-hot encoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

# `modifiers` module

In [13]:
# Reverse complement sequences (single sequence and batch)
revcomp_seq = seqpro.reverse_complement_seq(test_seq)
revcomp_seqs = seqpro.reverse_complement_seqs(test_seqs)

# Reverse complement a second time, which should restore the originals
revcomp_revcomp_seq = seqpro.reverse_complement_seq(revcomp_seq)
revcomp_revcomp_seqs = seqpro.reverse_complement_seqs(revcomp_seqs)

# Check that the double reverse complement of the batch equals the original batch
assert np.all(test_seqs == revcomp_revcomp_seqs)

# Check the single-sequence path as well, so that revcomp_revcomp_seq is
# not left unverified.
# NOTE(review): assumes reverse_complement_seq returns a value that compares
# directly against test_seq with == — confirm against seqpro.
assert np.all(test_seq == revcomp_revcomp_seq)

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
# Shuffle the single sequence, then the whole batch
shuffled_seq = seqpro.shuffle_seq(test_seq)
shuffled_seqs = seqpro.shuffle_seqs(test_seqs)

# At least one entry of the shuffled batch must differ from the original batch
assert np.any(test_seqs != shuffled_seqs)

In [17]:
# Dinucleotide-shuffle the single sequence, then the whole batch
dinuc_shuffled_seq = seqpro.dinuc_shuffle_seq(test_seq)
dinuc_shuffled_seqs = seqpro.dinuc_shuffle_seqs(test_seqs)

# At least one entry of the dinucleotide-shuffled batch must differ from the original batch
assert np.any(test_seqs != dinuc_shuffled_seqs)

# `analyzers` module

In [22]:
# Get the length of every sequence in the batch
lens = seqpro.len_seqs(test_seqs)

# Check that every reported length is 1000, the second argument that was
# passed to random_seqs when the sequences were generated
assert np.all(lens == 1000)

In [24]:
# GC content of a single sequence
gc_content = seqpro.gc_content_seq(test_seqs[0])

# The value has to lie in the closed interval [0, 1]
assert np.all(np.logical_and(gc_content >= 0, gc_content <= 1))

# The value has to be within 0.1 of 0.5
assert np.all(np.abs(gc_content - 0.5) < 0.1)

# GC content of every sequence in the batch
gc_contents = seqpro.gc_content_seqs(test_seqs)

# Every value has to lie in the closed interval [0, 1]
assert np.all(np.logical_and(gc_contents >= 0, gc_contents <= 1))

# Every value has to be within 0.1 of 0.5
assert np.all(np.abs(gc_contents - 0.5) < 0.1)

In [31]:
# Nucleotide content of a single sequence
nuc_content = seqpro.nucleotide_content_seq(test_seqs[0])

# Every value has to lie in the closed interval [0, 1]
assert np.all(np.logical_and(nuc_content >= 0, nuc_content <= 1))

# Every value has to be within 0.1 of 0.25
assert np.all(np.abs(nuc_content - 0.25) < 0.1)

# Nucleotide content of every sequence in the batch
nuc_contents = seqpro.nucleotide_content_seqs(test_seqs)

# Every value has to lie in the closed interval [0, 1]
assert np.all(np.logical_and(nuc_contents >= 0, nuc_contents <= 1))

# Every value has to be within 0.1 of 0.25
assert np.all(np.abs(nuc_contents - 0.25) < 0.1)

In [36]:
# Get kmer counts for the first sequence of the batch.
# The second argument (10) is presumably the k-mer length — TODO confirm against seqpro.
# NOTE(review): the result is not checked by any assertion in this cell.
kmer_counts = seqpro.count_kmers_seq(test_seqs[0], 10)

# `experimental` module