# Testing functions in `_seq_preprocessing.py`

**Authorship:**
Adam Klie, *06/23/2022*
***
**Description:**
Notebook to test the functions in `_seq_preprocessing.py`
***

In [9]:
# Autoreload extension
# Load the extension only when this kernel has not loaded it yet, so the cell
# can be re-run without asking IPython to load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# NOTE(review): mode 2 is presumably "reload all modules before executing code",
# so edits to the eugene sources are picked up live — confirm against the IPython docs.
%autoreload 2

# Basic import
import eugene as eu
# Last expression of the cell: display the version of the package under test.
eu.__version__

'0.1.0'

# Prep data

In [10]:
seq = eu.utils._random_data.random_seq(seq_len=10)
seq

'CACACCAGAC'

In [11]:
seqs = eu.utils._random_data.random_seqs(seq_num=5, seq_len=10)
seqs

array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
       'AGGATGGTTG'], dtype='<U10')

In [12]:
jagged_seqs = [eu.utils._random_data.random_seq(seq_len=10), eu.utils._random_data.random_seq(seq_len=5)]
jagged_seqs

['CTCCTTTTCC', 'TATCA']

# Test `_seq_preprocessing.py` functions

## Sanitize seqs

In [13]:
# TODO: add tests for the following functions

## Ascii encode seqs

In [14]:
# TODO: add tests for the following functions

## Reverse complement

In [15]:
from eugene.preprocessing import reverse_complement_seq, reverse_complement_seqs

In [16]:
rc_seq = reverse_complement_seq(seq)
seq, rc_seq, reverse_complement_seq(rc_seq)

('CACACCAGAC', 'GTCTGGTGTG', 'CACACCAGAC')

In [17]:
rc_seqs = reverse_complement_seqs(seqs)
seqs, rc_seqs, reverse_complement_seqs(rc_seqs)

Reverse complementing sequences:   0%|          | 0/5 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/5 [00:00<?, ?it/s]

(array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'),
 array(['GCCGGGGTGA', 'GAACATGTAC', 'GTTATTGCAC', 'GATCAACTGG',
        'CAACCATCCT'], dtype='<U10'),
 array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'))

In [18]:
reverse_complement_seqs(jagged_seqs)

Reverse complementing sequences:   0%|          | 0/2 [00:00<?, ?it/s]

array(['GGAAAAGGAG', 'TGATA'], dtype='<U10')

## One hot encoding

In [19]:
DNA = ["A", "C", "G", "T"]
RNA = ["A", "C", "G", "U"]
COMPLEMENT_DNA = {"A": "T", "C": "G", "G": "C", "T": "A"}
COMPLEMENT_RNA = {"A": "U", "C": "G", "G": "C", "U": "A"}

In [20]:
from eugene.preprocessing._utils import _get_vocab_dict, _get_index_dict

In [21]:
_get_vocab_dict(DNA), _get_index_dict(DNA)

({'A': 0, 'C': 1, 'G': 2, 'T': 3}, {0: 'A', 1: 'C', 2: 'G', 3: 'T'})

In [22]:
from eugene.preprocessing._utils import _tokenize, _token2one_hot, _one_hot2token, _sequencize

In [23]:
tokens = _tokenize(seq)
ohe = _token2one_hot(tokens)
decoded_tokens = _one_hot2token(ohe)
decoded_seq = _sequencize(decoded_tokens)
seq, tokens, ohe, decoded_tokens, decoded_seq

('CACACCAGAC',
 [1, 0, 1, 0, 1, 1, 0, 2, 0, 1],
 array([[0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0]], dtype=int8),
 array([1, 0, 1, 0, 1, 1, 0, 2, 0, 1]),
 'CACACCAGAC')

In [24]:
from eugene.preprocessing._utils import _pad_sequences 

In [25]:
# Pad the same jagged pair once per alignment mode, each with its own fill
# character, and display the three results together as a tuple.
(
    _pad_sequences(jagged_seqs, align="end", value="N"),
    _pad_sequences(jagged_seqs, align="start", value="$"),
    _pad_sequences(jagged_seqs, align="center", value="0"),
)

(['CTCCTTTTCC', 'NNNNNTATCA'],
 ['CTCCTTTTCC', 'TATCA$$$$$'],
 ['CTCCTTTTCC', '000TATCA00'])

In [26]:
padded_seqs = _pad_sequences(jagged_seqs, align="end", value="N")
tokens = _tokenize(padded_seqs[1])
ohe = _token2one_hot(tokens, fill_value=0.25)
decoded_tokens = _one_hot2token(ohe)
decoded_seq = _sequencize(decoded_tokens)
padded_seqs[1], tokens, ohe, decoded_tokens, decoded_seq

('NNNNNTATCA',
 [-1, -1, -1, -1, -1, 3, 0, 3, 1, 0],
 array([[0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25, 0.25],
        [0.  , 0.  , 0.  , 1.  ],
        [1.  , 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , 1.  ],
        [0.  , 1.  , 0.  , 0.  ],
        [1.  , 0.  , 0.  , 0.  ]], dtype=float16),
 array([-1, -1, -1, -1, -1,  3,  0,  3,  1,  0]),
 'NNNNNTATCA')

In [27]:
from eugene.preprocessing import ohe_seq, ohe_seqs, decode_seq, decode_seqs

In [28]:
ohe = ohe_seq(seq)
decoded_seq = decode_seq(ohe)
seq, ohe, decoded_seq

('CACACCAGAC',
 array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.]], dtype=float16),
 'CACACCAGAC')

In [29]:
ohes = ohe_seqs(seqs)
decoded_seqs = decode_seqs(ohes)
seqs, ohes, decoded_seqs

One-hot encoding sequences:   0%|          | 0/5 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/5 [00:00<?, ?it/s]

(array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'),
 array([[[0, 0, 0, 1],
         [0, 1, 0, 0],
         [1, 0, 0, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 0, 1, 0],
         [0, 0, 1, 0],
         [0, 1, 0, 0]],
 
        [[0, 0, 1, 0],
         [0, 0, 0, 1],
         [1, 0, 0, 0],
         [0, 1, 0, 0],
         [1, 0, 0, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 0],
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         [0, 1, 0, 0]],
 
        [[0, 0, 1, 0],
         [0, 0, 0, 1],
         [0, 0, 1, 0],
         [0, 1, 0, 0],
         [1, 0, 0, 0],
         [1, 0, 0, 0],
         [0, 0, 0, 1],
         [1, 0, 0, 0],
         [1, 0, 0, 0],
         [0, 1, 0, 0]],
 
        [[0, 1, 0, 0],
         [0, 1, 0, 0],
         [1, 0, 0, 0],
         [0, 0, 1, 0],
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         [0, 0, 1, 0],
         [1, 0, 0, 0],
         [0, 0,

In [30]:
jagged_ohe_seqs = ohe_seqs(jagged_seqs)
jagged_decoded_seqs = decode_seqs(jagged_ohe_seqs)
jagged_seqs, jagged_ohe_seqs, jagged_decoded_seqs

One-hot encoding sequences:   0%|          | 0/2 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/2 [00:00<?, ?it/s]

(['CTCCTTTTCC', 'TATCA'],
 array([[[0, 1, 0, 0],
         [0, 0, 0, 1],
         [0, 1, 0, 0],
         [0, 1, 0, 0],
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         [0, 0, 0, 1],
         [0, 1, 0, 0],
         [0, 1, 0, 0]],
 
        [[0, 0, 0, 1],
         [1, 0, 0, 0],
         [0, 0, 0, 1],
         [0, 1, 0, 0],
         [1, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]], dtype=int8),
 array(['CTCCTTTTCC', 'TATCANNNNN'], dtype='<U10'))

## Dinucleotide shuffle

In [31]:
from eugene.preprocessing import dinuc_shuffle_seq, dinuc_shuffle_seqs

In [32]:
dnt_shuf_seq = dinuc_shuffle_seq(seq, num_shufs=10)
seq, dnt_shuf_seq

('CACACCAGAC',
 ['CAGACACCAC',
  'CCACAGACAC',
  'CACAGACCAC',
  'CACCACAGAC',
  'CACCAGACAC',
  'CCAGACACAC',
  'CACACCAGAC',
  'CACCAGACAC',
  'CACCACAGAC',
  'CACCAGACAC'])

In [33]:
dnt_shuf_seqs = dinuc_shuffle_seqs(seqs)
seqs, dnt_shuf_seqs

(array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'),
 array(['TCCACCCGGC', 'GTGTACATTC', 'GTGCAAATAC', 'CCAGTTGATC',
        'AGATTGGGTG'], dtype='<U10'))

## In silico mutagenesis

In [34]:
from eugene.preprocessing import perturb_seqs

In [64]:
perturbed_seqs = perturb_seqs(ohes)

torch.Size([150, 4, 10])
torch.Size([5, 30, 4, 10])


  "X_0 has {} choices, but should have 4. Transposing".format(X_0.shape[1]),


In [73]:
ohes.shape

(5, 10, 4)

In [71]:
seqs[0]

'TCACCCCGGC'

In [70]:
decode_seqs(perturbed_seqs[0].numpy().transpose(0, 2, 1))

Decoding sequences:   0%|          | 0/30 [00:00<?, ?it/s]

array(['ACACCCCGGC', 'CCACCCCGGC', 'GCACCCCGGC', 'TGACCCCGGC',
       'TTACCCCGGC', 'TAACCCCGGC', 'TCCCCCCGGC', 'TCGCCCCGGC',
       'TCTCCCCGGC', 'TCAGCCCGGC', 'TCATCCCGGC', 'TCAACCCGGC',
       'TCACGCCGGC', 'TCACTCCGGC', 'TCACACCGGC', 'TCACCGCGGC',
       'TCACCTCGGC', 'TCACCACGGC', 'TCACCCGGGC', 'TCACCCTGGC',
       'TCACCCAGGC', 'TCACCCCTGC', 'TCACCCCAGC', 'TCACCCCCGC',
       'TCACCCCGTC', 'TCACCCCGAC', 'TCACCCCGCC', 'TCACCCCGGG',
       'TCACCCCGGT', 'TCACCCCGGA'], dtype='<U10')

In [63]:
perturbed_seqs[0].shape

torch.Size([36, 10, 4])

In [54]:
import torch
import numpy as np
def perturb_seq(X_0):
    """Build every single-position substitution of one one-hot encoded sequence.

    Parameters
    ----------
    X_0 : np.ndarray
        Two-dimensional array read as ``(n_choices, seq_len)``. The reference
        choice at each position is taken to be the argmax over the first axis.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(seq_len * (n_choices - 1), n_choices, seq_len)``
        with the same dtype as ``X_0``. Row ``p * (n_choices - 1) + (k - 1)``
        is a copy of ``X_0`` in which position ``p`` has its reference choice
        ``r`` set to 0 and choice ``(r + k) % n_choices`` set to 1.

    Raises
    ------
    ValueError
        If ``X_0`` is not a ``np.ndarray`` or is not two-dimensional.
    """
    if not isinstance(X_0, np.ndarray):
        raise ValueError("X_0 must be of type np.ndarray, not {}".format(type(X_0)))

    if X_0.ndim != 2:
        raise ValueError(
            "X_0 must have two dimensions: (n_choices, seq_len)."
        )

    n_choices, seq_len = X_0.shape

    # Reference choice per position: argmax over the choices axis (axis 0).
    # Computed with numpy before the tensor conversion so the index dtype is
    # an integer type torch accepts for advanced indexing.
    ref_choice = torch.from_numpy(X_0.argmax(axis=0))
    positions = torch.arange(seq_len)

    # torch.tile prepends the new leading dimension and copies the data, so the
    # in-place writes below never touch the caller's array.
    n = seq_len * (n_choices - 1)
    X = torch.tile(torch.from_numpy(X_0), (n, 1, 1))

    for k in range(1, n_choices):
        # Output rows holding the k-th alternative for each position.
        rows = positions * (n_choices - 1) + (k - 1)

        X[rows, ref_choice, positions] = 0
        X[rows, (ref_choice + k) % n_choices, positions] = 1

    return X

In [57]:
ohes.shape

(5, 10, 4)

In [55]:
perturb_seq(ohes[0]).shape

torch.Size([36, 10, 4])

In [51]:
ohes[0]

array([[0, 0, 0, 1],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0]], dtype=int8)

In [41]:
perturbed_seqs.shape

torch.Size([5, 30, 4, 10])

# Preprocess a random dataset by hand

## Load

In [19]:
# Read names, sequences, reverse complements and targets from the random TSV.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that file exists. It also rebinds `seqs`, which earlier cells used
# for the five random test sequences.
names, seqs, rev_seqs, targets = eu.dl.read(
    "/cellar/users/aklie/data/eugene/random_datasets/random100seqs_66bp/random_seqs.tsv",
    seq_col="SEQ",
    name_col="NAME",
    target_col="ACTIVITY",
    rev_comp=True,
    return_numpy=True,
)
# Show the first record of each returned collection.
names[0], seqs[0], rev_seqs[0], targets[0]

('seq001',
 'CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG',
 'CCTTCTTCCAAGCCACAAGGGGTACACCCTTGCCCAAGTAGATGCCCTTGGTGTATGGACCGGCTG',
 0.0570990784451763)

## Reverse Complement

In [20]:
rev_seqs = eu.pp.reverse_complement_seqs(seqs)
rev_ohe_seqs = eu.pp.ohe_DNA_seqs(rev_seqs)
decoded_rev_seqs = eu.pp.decode_DNA_seqs(rev_ohe_seqs)
rev_seqs[0], seqs[0], decoded_rev_seqs[0]

('CCTTCTTCCAAGCCACAAGGGGTACACCCTTGCCCAAGTAGATGCCCTTGGTGTATGGACCGGCTG',
 'CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG',
 'CCTTCTTCCAAGCCACAAGGGGTACACCCTTGCCCAAGTAGATGCCCTTGGTGTATGGACCGGCTG')

In [21]:
rev_ohe_seqs[0][:5], rev_seqs[0][:5]

(array([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]]),
 'CCTTC')

## One-hot-encoding 

In [22]:
# Keep the encoded array under its own name: binding it to `ohe_seq` would
# shadow the `ohe_seq` function imported from eugene.preprocessing earlier in
# this notebook, so the earlier cells that call it could not be re-run.
first_ohe_seq = eu.pp.ohe_DNA_seq(seqs[0])
decoded_seq = eu.pp.decode_DNA_seq(first_ohe_seq)
# Round trip: original sequence, first five encoded rows, decoded sequence.
seqs[0], first_ohe_seq[:5], decoded_seq

('CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG',
 array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]]),
 'CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG')

In [23]:
# NOTE(review): this assignment rebinds `ohe_seqs`, which an earlier cell
# imported as a function from eugene.preprocessing. Once this cell has run,
# the earlier `ohe_seqs(...)` calls cannot be re-run without re-importing.
# Later cells read this array under the name `ohe_seqs`, so any rename has to
# be applied to them as well.
ohe_seqs = eu.pp.ohe_DNA_seqs(seqs)
decoded_seqs = eu.pp.decode_DNA_seqs(ohe_seqs)
# Compare the first five bases of the first sequence before encoding, as
# encoded rows, and after decoding.
seqs[0][:5], ohe_seqs[0][:5], decoded_seqs[0][:5]

('CAGCC',
 array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]]),
 'CAGCC')

## Encode Names

In [24]:
encoded_names = [eu.pp.ascii_encode(name) for name in names]
decoded_names = [eu.pp.ascii_decode(name) for name in encoded_names]
names[0], encoded_names[0], decoded_names[0]

('seq001', array([115, 101, 113,  48,  48,  49]), 'seq001')

## Dinucleotide Shuffle

In [25]:
eu.pp.dinuc_shuffle_seq(seqs[0])

'CCAGGTCCTGGTATTAGCCCAGGGTGCTCAGGGGCACCATTACAAAAACGACCTGGGCTGTTGAGG'

In [26]:
eu.pp.decode_DNA_seq(eu.pp.dinuc_shuffle_seq(ohe_seqs[0]))

'CCAAATGCGGGGCAGTGTCTGCCCCCCAGCTTTCAGGTGGTATAACACACTACTTGGGAGGAAGGG'

## Dataset Processing

In [27]:
train_seqs, test_seqs, train_targets, test_targets = eu.pp.split_train_test(ohe_seqs, targets)
len(train_seqs), len(test_seqs), len(train_targets), len(test_targets)

(80, 20, 80, 20)

# Working with sdata

In [29]:
sdata = eu.datasets.random1000()

In [30]:
eu.pp.reverse_complement_data(sdata)
eu.pp.one_hot_encode_data(sdata)
eu.pp.train_test_split_data(sdata)
sdata

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
SeqData object modified:
    seqs_annot:
        + TRAIN


SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'TARGETS', 'TRAIN'
pos_annot: PyRanges object with 1456 features

In [31]:
sdata = eu.datasets.random1000()
eu.pp.prepare_data(sdata)

  0%|          | 0/3 [00:00<?, ?it/s]

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + TRAIN


---

# Scratch