# Testing `eugene.preprocessing` Module 

**Authorship:**
Adam Klie, *06/23/2022*
***
**Description:**
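Notebook to test the `preprocessing` module of the `eugene` package. It first exercises the sequence-level functions (reverse complement, one-hot encoding, dinucleotide shuffle, in silico mutagenesis) on small random inputs, then repeats the steps by hand on a random dataset loaded from disk, and finally runs the `SeqData`-level helpers.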
***

In [1]:
# Load IPython's autoreload extension only if it is not already loaded, so that
# re-running this cell does not load it twice; then enable mode 2 so that edits to
# the package source are picked up without restarting the kernel.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Import the package under test and display its version as the cell output.
import eugene as eu
eu.__version__

Global seed set to 13
Global seed set to 13
Global seed set to 13


'0.0.0'

# Prep data

In [2]:
# Single random sequence of length 10; used as the one-sequence fixture in the tests below.
seq = eu.utils._random_data.random_seq(seq_len=10)
seq

'CACACCAGAC'

In [3]:
# Batch fixture: 5 random sequences of length 10 (a NumPy string array, per the recorded output).
seqs = eu.utils._random_data.random_seqs(seq_num=5, seq_len=10)
seqs

array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
       'AGGATGGTTG'], dtype='<U10')

In [4]:
# Ragged fixture: two random sequences of unequal length (10 and 5), kept as a plain list.
jagged_seqs = [
    eu.utils._random_data.random_seq(seq_len=length)
    for length in (10, 5)
]
jagged_seqs

['CTCCTTTTCC', 'TATCA']

# Test `_seq_preprocess.py` functions

## Reverse complement

In [9]:
from eugene.preprocessing import reverse_complement_seq, reverse_complement_seqs

In [10]:
# Reverse-complement the single sequence, then reverse-complement the result again.
rc_seq = reverse_complement_seq(seq)
rc_roundtrip_seq = reverse_complement_seq(rc_seq)

# Reverse complement is its own inverse and never changes length; fail loudly if
# either property breaks instead of relying on eyeballing the displayed tuple.
assert len(rc_seq) == len(seq), "reverse complement changed the sequence length"
assert rc_roundtrip_seq == seq, "reverse_complement_seq applied twice did not return the input"

seq, rc_seq, rc_roundtrip_seq

('CACACCAGAC', 'GTCTGGTGTG', 'CACACCAGAC')

In [11]:
# Batch version: reverse-complement every sequence, then undo it with a second pass.
rc_seqs = reverse_complement_seqs(seqs)
rc_roundtrip_seqs = reverse_complement_seqs(rc_seqs)

# Compare element-wise through lists so the check works for both arrays and lists.
assert len(rc_seqs) == len(seqs), "batch reverse complement changed the number of sequences"
assert list(rc_roundtrip_seqs) == list(seqs), (
    "reverse_complement_seqs applied twice did not return the input"
)

seqs, rc_seqs, rc_roundtrip_seqs

(array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'),
 array(['GCCGGGGTGA', 'GAACATGTAC', 'GTTATTGCAC', 'GATCAACTGG',
        'CAACCATCCT'], dtype='<U10'),
 array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'))

In [12]:
# Ragged input: per the recorded output each sequence is reverse-complemented at its
# own length (the 5-bp sequence stays 5 bp; no padding is added).
reverse_complement_seqs(jagged_seqs)

array(['GGAAAAGGAG', 'TGATA'], dtype='<U10')

## One-hot encoding

In [13]:
from eugene.preprocessing import ohe_DNA_seq, ohe_DNA_seqs, decode_DNA_seq, decode_DNA_seqs

In [14]:
# One-hot encode the single sequence and decode it back.
ohe_seq = ohe_DNA_seq(seq)
decoded_seq = decode_DNA_seq(ohe_seq)

# One row per base, four channels per row, and a lossless round trip.
assert ohe_seq.shape == (len(seq), 4), f"unexpected one-hot shape {ohe_seq.shape}"
assert decoded_seq == seq, "decode_DNA_seq(ohe_DNA_seq(seq)) did not return the input"

seq, ohe_seq, decoded_seq

('CACACCAGAC',
 array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.]]),
 'CACACCAGAC')

In [15]:
# One-hot encode the equal-length batch and decode it back.
ohe_seqs = ohe_DNA_seqs(seqs)
decoded_seqs = decode_DNA_seqs(ohe_seqs)

# Expected layout is (num_seqs, seq_len, 4); all fixture sequences share one length.
assert ohe_seqs.shape == (len(seqs), len(seqs[0]), 4), f"unexpected one-hot shape {ohe_seqs.shape}"
# Compare through lists so the check is independent of the container type returned.
assert list(decoded_seqs) == list(seqs), "batch one-hot round trip did not return the input"

seqs, ohe_seqs, decoded_seqs

(array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'),
 array([[[0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.]],
 
        [[0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.]],
 
        [[0., 0., 1., 0.],
         [0., 0., 0., 1.],
         [0., 0., 1., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 1., 0., 0.]],
 
        [[0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.]

In [16]:
# One-hot encode the ragged pair and decode it back.
# In the recorded output the 5-bp sequence is padded with all-zero rows up to length 10,
# and those all-zero rows decode as 'A', so the round trip yields 'TATCAAAAAA' rather
# than 'TATCA'.
# NOTE(review): confirm whether decoding padding rows as 'A' is intended behavior.
jagged_ohe_seqs = ohe_DNA_seqs(jagged_seqs)
jagged_decoded_seqs = decode_DNA_seqs(jagged_ohe_seqs)
jagged_seqs, jagged_ohe_seqs, jagged_decoded_seqs

(['CTCCTTTTCC', 'TATCA'],
 array([[[0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [0., 1., 0., 0.]],
 
        [[0., 0., 0., 1.],
         [1., 0., 0., 0.],
         [0., 0., 0., 1.],
         [0., 1., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]]),
 ['CTCCTTTTCC', 'TATCAAAAAA'])

## Dinucleotide shuffle

In [17]:
from eugene.preprocessing import dinuc_shuffle_seq, dinuc_shuffle_seqs

In [18]:
# Ask for 10 dinucleotide shuffles of the single sequence; the recorded output is a
# list of 10 strings.
# NOTE(review): the first five shuffles in the recorded output are identical — confirm
# this is expected for such a short sequence rather than an RNG/seed issue.
dnt_shuf_seq = dinuc_shuffle_seq(seq, num_shufs=10)
seq, dnt_shuf_seq

('CACACCAGAC',
 ['CAGACACCAC',
  'CAGACACCAC',
  'CAGACACCAC',
  'CAGACACCAC',
  'CAGACACCAC',
  'CCACAGACAC',
  'CACCAGACAC',
  'CACACCAGAC',
  'CACCACAGAC',
  'CACCACAGAC'])

In [19]:
# Batch shuffle with default arguments: the recorded output holds one shuffled
# sequence per input sequence (the first one happens to come back unchanged).
dnt_shuf_seqs = dinuc_shuffle_seqs(seqs)
seqs, dnt_shuf_seqs

(array(['TCACCCCGGC', 'GTACATGTTC', 'GTGCAATAAC', 'CCAGTTGATC',
        'AGGATGGTTG'], dtype='<U10'),
 array(['TCACCCCGGC', 'GTTGTACATC', 'GTGCAAATAC', 'CCAGTGATTC',
        'AGGATTGGTG'], dtype='<U10'))

## In silico mutagenesis

In [27]:
from eugene.preprocessing import perturb_seqs

In [32]:
# Swap the last two axes so the 4-channel axis comes before the position axis
# ((5, 10, 4) -> (5, 4, 10), per the shape shown further down) and perturb that layout.
perturbed_seqs = perturb_seqs(ohe_seqs.transpose(0,2,1))

In [34]:
# Shape of the array that was passed to perturb_seqs above.
ohe_seqs.transpose(0,2,1).shape

(5, 4, 10)

In [37]:
# Shape of the result for the first input sequence. The recorded output is
# torch.Size([30, 4, 10]) — presumably 10 positions x 3 alternative bases; confirm
# against the perturb_seqs implementation.
perturbed_seqs[0].shape

torch.Size([30, 4, 10])

# Preprocess a random dataset by hand

## Load

In [19]:
import os

# Location of the random 100-sequence, 66-bp dataset. The default is the original
# author's path; set EUGENE_RANDOM_SEQS_TSV to run this notebook on another machine.
RANDOM_SEQS_TSV = os.environ.get(
    "EUGENE_RANDOM_SEQS_TSV",
    "/cellar/users/aklie/data/eugene/random_datasets/random100seqs_66bp/random_seqs.tsv",
)

# Read names, sequences, reverse complements and targets from the TSV.
# Note: this rebinds `seqs`, which until now held the 5 short random sequences.
names, seqs, rev_seqs, targets = eu.dl.read(
    RANDOM_SEQS_TSV,
    seq_col="SEQ",
    name_col="NAME",
    target_col="ACTIVITY",
    rev_comp=True,
    return_numpy=True,
)
names[0], seqs[0], rev_seqs[0], targets[0]

('seq001',
 'CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG',
 'CCTTCTTCCAAGCCACAAGGGGTACACCCTTGCCCAAGTAGATGCCCTTGGTGTATGGACCGGCTG',
 0.0570990784451763)

## Reverse Complement

In [20]:
# Recompute the reverse complements with the preprocessing module. This rebinds
# `rev_seqs`, replacing the value returned by eu.dl.read(..., rev_comp=True); the
# recorded outputs show the same first sequence either way.
rev_seqs = eu.pp.reverse_complement_seqs(seqs)
# One-hot encode the reverse complements and decode them back as a round-trip check.
rev_ohe_seqs = eu.pp.ohe_DNA_seqs(rev_seqs)
decoded_rev_seqs = eu.pp.decode_DNA_seqs(rev_ohe_seqs)
rev_seqs[0], seqs[0], decoded_rev_seqs[0]

('CCTTCTTCCAAGCCACAAGGGGTACACCCTTGCCCAAGTAGATGCCCTTGGTGTATGGACCGGCTG',
 'CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG',
 'CCTTCTTCCAAGCCACAAGGGGTACACCCTTGCCCAAGTAGATGCCCTTGGTGTATGGACCGGCTG')

In [21]:
# Show the first five one-hot rows of the first reverse-complemented sequence next to
# its first five bases, for a visual check of the encoding.
rev_ohe_seqs[0][:5], rev_seqs[0][:5]

(array([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]]),
 'CCTTC')

## One-hot encoding

In [22]:
# One-hot encode the first loaded sequence and decode it back.
# Note: this rebinds `ohe_seq` and `decoded_seq` from the earlier random-sequence test.
ohe_seq = eu.pp.ohe_DNA_seq(seqs[0])
decoded_seq = eu.pp.decode_DNA_seq(ohe_seq)

# The round trip must be lossless; check it rather than comparing two long strings by eye.
assert decoded_seq == seqs[0], "one-hot round trip changed the first loaded sequence"

seqs[0], ohe_seq[:5], decoded_seq

('CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG',
 array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]]),
 'CAGCCGGTCCATACACCAAGGGCATCTACTTGGGCAAGGGTGTACCCCTTGTGGCTTGGAAGAAGG')

In [23]:
# One-hot encode the whole loaded dataset and decode it back.
# Note: this rebinds `ohe_seqs` and `decoded_seqs`, which previously held the encoding
# of the 5 short random sequences; later cells use this dataset-sized version.
ohe_seqs = eu.pp.ohe_DNA_seqs(seqs)
decoded_seqs = eu.pp.decode_DNA_seqs(ohe_seqs)
# Only the first five positions of the first sequence are displayed to keep the output small.
seqs[0][:5], ohe_seqs[0][:5], decoded_seqs[0][:5]

('CAGCC',
 array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]]),
 'CAGCC')

## Encode Names

In [24]:
# Convert every sequence name to its ASCII-code form and back again, then show the
# first name at each stage of the round trip.
encoded_names = list(map(eu.pp.ascii_encode, names))
decoded_names = list(map(eu.pp.ascii_decode, encoded_names))
names[0], encoded_names[0], decoded_names[0]

('seq001', array([115, 101, 113,  48,  48,  49]), 'seq001')

## Dinucleotide Shuffle

In [25]:
# Dinucleotide-shuffle the first loaded sequence, passed as a plain string; the
# recorded output is a single shuffled string.
eu.pp.dinuc_shuffle_seq(seqs[0])

'CCAGGTCCTGGTATTAGCCCAGGGTGCTCAGGGGCACCATTACAAAAACGACCTGGGCTGTTGAGG'

In [26]:
# Same shuffle, but fed the one-hot encoded sequence; the result is decoded back to a
# string for display, which shows the function also accepts one-hot input.
eu.pp.decode_DNA_seq(eu.pp.dinuc_shuffle_seq(ohe_seqs[0]))

'CCAAATGCGGGGCAGTGTCTGCCCCCCAGCTTTCAGGTGGTATAACACACTACTTGGGAGGAAGGG'

## Dataset Processing

In [27]:
# Split the one-hot encoded sequences and their targets into train and test subsets.
train_seqs, test_seqs, train_targets, test_targets = eu.pp.split_train_test(ohe_seqs, targets)

# The split must partition the data: nothing lost or duplicated, and sequences and
# targets must stay aligned within each subset.
n_train, n_test = len(train_seqs), len(test_seqs)
assert n_train + n_test == len(ohe_seqs), "train/test split lost or duplicated sequences"
assert len(train_targets) == n_train, "train sequences and targets are misaligned"
assert len(test_targets) == n_test, "test sequences and targets are misaligned"

len(train_seqs), len(test_seqs), len(train_targets), len(test_targets)

(80, 20, 80, 20)

# Working with sdata

In [29]:
# Load the bundled random dataset as a SeqData object (1000 sequences, per the
# summary printed by the next cell).
sdata = eu.datasets.random1000()

In [30]:
# Run the three preprocessing steps one at a time. Each call modifies `sdata` in place
# and prints a "SeqData object modified" report (see the recorded output):
eu.pp.reverse_complement_data(sdata)  # adds rev_seqs
eu.pp.one_hot_encode_data(sdata)  # adds ohe_seqs and ohe_rev_seqs
eu.pp.train_test_split_data(sdata)  # adds a TRAIN entry to seqs_annot
# Display the resulting object summary.
sdata

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
SeqData object modified:
    seqs_annot:
        + TRAIN


SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'TARGETS', 'TRAIN'
pos_annot: PyRanges object with 1456 features

In [31]:
# Start again from a freshly loaded dataset and run the combined helper. Per the
# recorded output it reports the same additions as the three separate calls above
# (rev_seqs, ohe_seqs, ohe_rev_seqs, TRAIN) behind a 3-step progress bar.
sdata = eu.datasets.random1000()
eu.pp.prepare_data(sdata)

  0%|          | 0/3 [00:00<?, ?it/s]

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + TRAIN


---

# Scratch