# Testing the `preprocess` module

**Authorship:**
Adam Klie, *10/04/2022*
***
**Description:**
Notebook for testing out the `preprocess` module.

In [7]:
# Load the autoreload extension only if this kernel has not loaded it yet,
# so re-running the cell does not try to load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload imported modules before each cell runs, so source edits are picked up.
%autoreload 2

import os
import torch
import numpy as np
import pandas as pd
import eugene as eu

2022-10-09 13:57:50.445752: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Sequence preprocessing

## Prep data

In [8]:
# Shared fixtures for the sequence tests below (read as notebook globals).
# NOTE(review): no random seed is set here, so the sequences differ between runs.
seq = eu.utils.random_seq(seq_len=10)
seqs = eu.utils.random_seqs(seq_num=10, seq_len=10)
# Two sequences of unequal length (10 then 5) to exercise jagged inputs.
jagged_seqs = [eu.utils.random_seq(seq_len=length) for length in (10, 5)]
# Mixed case plus leading/trailing whitespace, as input for the sanitizer.
bad_seqs = [
    "AggaAATC ",
    " GGTAa",
]

## Sanitize seqs

In [9]:
def test_sanitize_seqs(bad_seqs):
    """Check that sanitizing upper-cases sequences and strips surrounding whitespace.

    Parameters
    ----------
    bad_seqs : list of str
        Two raw sequences; the expected cleaned forms are hard-coded below.
    """
    expected = ['AGGAAATC', 'GGTAA']

    # Single-sequence entry point.
    cleaned_first = eu.pp.sanitize_seq(bad_seqs[0])
    assert cleaned_first == expected[0]

    # Batch entry point, compared element-wise.
    cleaned_all = eu.pp.sanitize_seqs(bad_seqs)
    assert np.all(cleaned_all == np.array(expected))
test_sanitize_seqs(bad_seqs)

## ASCII encode seqs

In [10]:
def test_ascii_seqs():
    """Check that ASCII encoding followed by decoding returns the original input.

    Reads the notebook globals ``seq`` (single sequence) and ``seqs`` (batch).
    """
    # Single sequence round trip.
    roundtrip_seq = eu.pp.ascii_decode_seq(eu.pp.ascii_encode_seq(seq))
    assert roundtrip_seq == seq

    # Batch round trip, compared element-wise.
    roundtrip_seqs = eu.pp.ascii_decode_seqs(eu.pp.ascii_encode_seqs(seqs))
    assert np.all(roundtrip_seqs == seqs)
test_ascii_seqs()

## Reverse complement

In [14]:
def test_reverse_complement_seqs():
    """Check that reverse-complementing twice gives back the original sequence(s).

    Reads the notebook globals ``seq``, ``seqs`` and ``jagged_seqs``.
    """
    rc_one = eu.pp.reverse_complement_seq
    rc_many = eu.pp.reverse_complement_seqs

    # Single sequence.
    assert seq == rc_one(rc_one(seq))

    # Equal-length batch first, then the jagged pair.
    for batch in (seqs, jagged_seqs):
        assert np.all(batch == rc_many(rc_many(batch)))
test_reverse_complement_seqs()

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/2 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/2 [00:00<?, ?it/s]

## One hot encoding

In [15]:
# Nucleotide alphabets, in the index order used for tokens / one-hot rows.
DNA = list("ACGT")
RNA = list("ACGU")
# With this ordering each base's complement is the alphabet read backwards
# (A<->T/U, C<->G), so the lookup tables can be zipped from the alphabets.
COMPLEMENT_DNA = dict(zip(DNA, reversed(DNA)))
COMPLEMENT_RNA = dict(zip(RNA, reversed(RNA)))

In [16]:
from eugene.preprocess._utils import _get_vocab_dict, _get_index_dict

In [17]:
# Display the letter -> index and index -> letter lookup tables built from the DNA alphabet.
_get_vocab_dict(DNA), _get_index_dict(DNA)

({'A': 0, 'C': 1, 'G': 2, 'T': 3}, {0: 'A', 1: 'C', 2: 'G', 3: 'T'})

In [18]:
from eugene.preprocess._utils import _tokenize, _token2one_hot, _one_hot2token, _sequencize

In [19]:
# Walk one sequence through the low-level helpers and back:
# string -> integer tokens -> one-hot array -> integer tokens -> string.
tokens = _tokenize(seq)
ohe = _token2one_hot(tokens)
decoded_tokens = _one_hot2token(ohe)
decoded_seq = _sequencize(decoded_tokens)
# Display every stage; the first and last entries should be the same string.
seq, tokens, ohe, decoded_tokens, decoded_seq

('CACACCAGAC',
 [1, 0, 1, 0, 1, 1, 0, 2, 0, 1],
 array([[0, 1, 0, 1, 0, 0, 1, 0, 1, 0],
        [1, 0, 1, 0, 1, 1, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8),
 array([1, 0, 1, 0, 1, 1, 0, 2, 0, 1]),
 'CACACCAGAC')

In [20]:
from eugene.preprocess._utils import _pad_sequences 

In [21]:
# Pad the jagged pair three ways; each entry is (alignment, pad character).
tuple(
    _pad_sequences(jagged_seqs, align=align, value=value)
    for align, value in (("end", "N"), ("start", "$"), ("center", "0"))
)

(['CTCCTTTTCC', 'NNNNNTATCA'],
 ['CTCCTTTTCC', 'TATCA$$$$$'],
 ['CTCCTTTTCC', '000TATCA00'])

In [22]:
# Pad with "N"; with align="end" the padding is placed in front of the shorter sequence.
padded_seqs = _pad_sequences(jagged_seqs, align="end", value="N")
# Tokenize the padded (shorter) sequence; the "N" positions come out as -1.
tokens = _tokenize(padded_seqs[1])
# fill_value=0.25: positions with token -1 get 0.25 in every row instead of a one-hot column.
ohe = _token2one_hot(tokens, fill_value=0.25)
decoded_tokens = _one_hot2token(ohe)
decoded_seq = _sequencize(decoded_tokens)
# Display every stage; the decoded string should reproduce the padded input, "N"s included.
padded_seqs[1], tokens, ohe, decoded_tokens, decoded_seq

('NNNNNTATCA',
 [-1, -1, -1, -1, -1, 3, 0, 3, 1, 0],
 array([[0.25, 0.25, 0.25, 0.25, 0.25, 0.  , 1.  , 0.  , 0.  , 1.  ],
        [0.25, 0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 1.  , 0.  ],
        [0.25, 0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  , 0.  ],
        [0.25, 0.25, 0.25, 0.25, 0.25, 1.  , 0.  , 1.  , 0.  , 0.  ]],
       dtype=float16),
 array([-1, -1, -1, -1, -1,  3,  0,  3,  1,  0]),
 'NNNNNTATCA')

In [18]:
def test_ohe_seqs():
    """Check one-hot encode/decode round trips, including reverse complements.

    Reads the notebook globals ``seq``, ``seqs`` and ``jagged_seqs``.

    Checks performed:
    * a single sequence survives encode -> decode;
    * a batch survives encode -> decode;
    * reverse-complementing the one-hot batch and decoding it matches
      reverse-complementing the decoded strings;
    * a jagged batch can be encoded and decoded, yielding one result per input.
    """
    from eugene.preprocess import ohe_seq, ohe_seqs, decode_seq, decode_seqs

    # Single sequence round trip.
    ohe = ohe_seq(seq)
    decoded_seq = decode_seq(ohe)
    assert seq == decoded_seq

    # Batch round trip.
    ohe = ohe_seqs(seqs)
    decoded_seqs = decode_seqs(ohe)
    assert np.all(seqs == decoded_seqs)

    # Reverse complement in one-hot space vs. string space. The original repeated
    # the round-trip assertion here and never used rc_ohe / rcs.
    rc_ohe = eu.pp.reverse_complement_seqs(ohe)
    rcs = eu.pp.reverse_complement_seqs(decoded_seqs)
    assert np.all(decode_seqs(rc_ohe) == rcs)

    # Jagged input: only the number of decoded sequences is checked, because the
    # padding behaviour of the encoder is not pinned down in this notebook.
    jagged_ohe_seqs = ohe_seqs(jagged_seqs)
    jagged_decoded_seqs = decode_seqs(jagged_ohe_seqs)
    assert len(jagged_decoded_seqs) == len(jagged_seqs)

test_ohe_seqs()

One-hot encoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Reverse complementing sequences:   0%|          | 0/10 [00:00<?, ?it/s]

One-hot encoding sequences:   0%|          | 0/2 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/2 [00:00<?, ?it/s]

In [46]:
# Reverse-complement the one-hot encoded batch, then decode the result back to strings.
rc_ohes = eu.pp.reverse_complement_seqs(eu.pp.ohe_seqs(seqs))
rcs = eu.pp.decode_seqs(rc_ohes)

One-hot encoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

## Dinucleotide shuffle

In [20]:
def test_dinuc_shuffle_seqs():
    """Check that dinucleotide shuffling changes the input and has the expected size.

    Reads the notebook globals ``seq`` and ``seqs``.
    NOTE(review): both the sequence length and ``num_shufs`` are 10 here, so the
    size assertions cannot tell which of the two they are measuring.
    """
    from eugene.preprocess import dinuc_shuffle_seq, dinuc_shuffle_seqs

    # Single sequence.
    shuffled_one = dinuc_shuffle_seq(seq, num_shufs=10)
    assert np.all(seq != shuffled_one)
    assert len(shuffled_one) == 10

    # Batch of sequences.
    shuffled_batch = dinuc_shuffle_seqs(seqs, num_shufs=10)
    assert np.all(seqs != shuffled_batch)
    assert shuffled_batch.shape == (10, 10)
test_dinuc_shuffle_seqs()

## Perturb Seqs

In [29]:
# One-hot encode the single sequence and the batch; test_perturb_seqs reads both as globals.
ohe = eu.pp.ohe_seq(seq)
ohes = eu.pp.ohe_seqs(seqs)

One-hot encoding sequences:   0%|          | 0/10 [00:00<?, ?it/s]

In [35]:
def test_perturb_seqs():
    """Check the output shapes of sequence perturbation for one sequence and a batch.

    Reads the notebook globals ``ohe`` and ``ohes``.
    The expected count of 30 is presumably 3 alternative bases x 10 positions
    -- TODO confirm against ``perturb_seq``.
    """
    from eugene.preprocess import perturb_seq, perturb_seqs

    n_variants = 30
    variant_shape = (n_variants, 4, 10)

    # Single one-hot sequence -> stack of perturbed variants.
    variants = perturb_seq(ohe)
    assert variants.shape == variant_shape
    assert len(eu.pp.decode_seqs(variants)) == n_variants

    # Batch of 10 -> one stack of variants per input sequence.
    batch_variants = perturb_seqs(ohes)
    assert batch_variants.shape == (10,) + variant_shape
    assert len(eu.pp.decode_seqs(batch_variants[0])) == n_variants

test_perturb_seqs()


Decoding sequences:   0%|          | 0/30 [00:00<?, ?it/s]

Decoding sequences:   0%|          | 0/30 [00:00<?, ?it/s]

## Feature implant 

In [222]:
from eugene.preprocess._utils import _token2one_hot

In [302]:
# Prep data
# Load the motif file and pick the motif registered under "TATA".
meme = eu.dl.motif.MinimalMEME(path="../../_data/CPEs.meme")
motif = meme.motifs["TATA"]
name = motif.name
# Keep only entries 4..7 of the PFM and of the consensus, so the feature is
# 4 long and fits inside the 10-long test sequence.
pfm = motif.pfm[4:8]
consensus = motif.consensus[4:8] 
# Position used by the single-position implant calls.
pos = 2

In [304]:
def test_feature_implant():
    """Slide the consensus string across ``seq`` and check the number of results.

    Reads the notebook globals ``seq`` and ``consensus``. The original cell held
    an unfinished ``def`` line with no parentheses, colon or body, which is a
    syntax error; the call that followed it is now the body of the test.

    Returns
    -------
    The array of implanted sequences, so the cell still displays it.
    """
    implanted = eu.pp.feature_implant_across_seq(seq, consensus, encoding="str")
    # One result per start position at which the feature fits entirely inside the sequence.
    assert len(implanted) == len(seq) - len(consensus) + 1
    # Implanting overwrites bases, so every result keeps the original length.
    assert all(len(s) == len(seq) for s in implanted)
    return implanted
test_feature_implant()

array(['TATACCAGAC', 'CTATACAGAC', 'CATATAAGAC', 'CACTATAGAC',
       'CACATATAAC', 'CACACTATAC', 'CACACCTATA'], dtype='<U10')

In [305]:
# Implant the transposed PFM slice into the one-hot sequence at position `pos`.
eu.pp.feature_implant_seq(ohe, pfm.T, pos, encoding="onehot")

array([[0.   , 1.   , 0.01 , 0.968, 0.002, 0.992, 1.   , 0.   , 1.   ,
        0.   ],
       [1.   , 0.   , 0.002, 0.   , 0.014, 0.   , 0.   , 0.   , 0.   ,
        1.   ],
       [0.   , 0.   , 0.002, 0.   , 0.006, 0.002, 0.   , 1.   , 0.   ,
        0.   ],
       [0.   , 0.   , 0.986, 0.032, 0.978, 0.006, 0.   , 0.   , 0.   ,
        0.   ]])

In [261]:
# Implant the transposed PFM slice at each position along the one-hot sequence in turn.
eu.pp.feature_implant_across_seq(ohe, pfm.T, encoding="onehot")

array([[[0.01 , 0.968, 0.002, 0.992, 0.   , 0.   , 1.   , 0.   , 1.   ,
         0.   ],
        [0.002, 0.   , 0.014, 0.   , 1.   , 1.   , 0.   , 0.   , 0.   ,
         1.   ],
        [0.002, 0.   , 0.006, 0.002, 0.   , 0.   , 0.   , 1.   , 0.   ,
         0.   ],
        [0.986, 0.032, 0.978, 0.006, 0.   , 0.   , 0.   , 0.   , 0.   ,
         0.   ]],

       [[0.   , 0.01 , 0.968, 0.002, 0.992, 0.   , 1.   , 0.   , 1.   ,
         0.   ],
        [1.   , 0.002, 0.   , 0.014, 0.   , 1.   , 0.   , 0.   , 0.   ,
         1.   ],
        [0.   , 0.002, 0.   , 0.006, 0.002, 0.   , 0.   , 1.   , 0.   ,
         0.   ],
        [0.   , 0.986, 0.032, 0.978, 0.006, 0.   , 0.   , 0.   , 0.   ,
         0.   ]],

       [[0.   , 1.   , 0.01 , 0.968, 0.002, 0.992, 1.   , 0.   , 1.   ,
         0.   ],
        [1.   , 0.   , 0.002, 0.   , 0.014, 0.   , 0.   , 0.   , 0.   ,
         1.   ],
        [0.   , 0.   , 0.002, 0.   , 0.006, 0.002, 0.   , 1.   , 0.   ,
         0.   ],
        [0.   , 0

In [266]:
# Display the one-hot array `ohe`.
ohe

array([[0., 1., 0., 1., 0., 0., 1., 0., 1., 0.],
       [1., 0., 1., 0., 1., 1., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float16)

In [265]:
# Implant at `pos` with onehot=True; the displayed result contains only 0/1 values.
# NOTE(review): `pfm` is passed untransposed here, whereas the earlier onehot call
# used `pfm.T` -- confirm which orientation the function expects.
eu.pp.feature_implant_seq(ohe, pfm, pos, encoding="onehot", onehot=True)

array([[0., 1., 0., 1., 0., 1., 1., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0.]], dtype=float16)

In [267]:
# Implant at each position along the one-hot sequence with onehot=True (0/1 output).
eu.pp.feature_implant_across_seq(ohe, pfm, encoding="onehot", onehot=True)

array([[[0., 1., 0., 1., 0., 0., 1., 0., 1., 0.],
        [0., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [1., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 1., 0., 1., 0., 1., 0.],
        [1., 0., 0., 0., 0., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 1., 0., 1., 1., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 1., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 1., 0., 1., 0., 1., 0.],
        [1., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 1., 0., 0., 0., 0.]],

       [[0., 1., 0., 1., 0., 1., 0., 1., 1., 0.],
        [1., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 1., 0., 0

In [3]:
# Load the built-in random dataset (1000 sequences, per the outputs below) as a SeqData object.
sdata = eu.datasets.random1000()

In [4]:
# One-hot encode the sequences in `sdata`; the printed summary reports ohe_seqs being added.
# NOTE(review): a later cell calls eu.pp.ohe_seqs_sdata for what looks like the
# same step -- confirm which name the installed version exposes.
eu.pp.one_hot_encode_data(sdata)

One-hot-encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added


In [5]:
# Prep data
# Model whose input length (66) matches the per-sequence length of sdata.ohe_seqs shown below.
model = eu.models.DeepBind(input_len=66, output_dim=1)
# Rebinds the global `seq` to the first sequence of the dataset.
seq = sdata.seqs[0]
# NOTE(review): this binds the name `ohe_seq`, the same name as the function
# imported locally inside test_ohe_seqs; other cells use eu.pp.ohe_seq rather
# than ohe_DNA_seq -- confirm which API is current.
ohe_seq = eu.pp.ohe_DNA_seq(seq)
meme = eu.utils.MinimalMEME(path="../../_datasets/jores21/CPEs.meme")
motif = meme.motifs["TATA"]
name = motif.name
# Full PFM this time (no slicing).
pfm = motif.pfm
# Derive the consensus string by decoding the PFM.
consensus = eu.pp.decode_DNA_seq(pfm)
pos = 2

In [6]:
# Implant the consensus string into `seq` at position `pos`.
eu.pp.feature_implant_seq(seq, consensus, pos, encoding="str")

'AGCCCCTATAAATACCCCTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG'

In [7]:
# Implant the consensus string at each position along `seq`; one string is returned per position.
eu.pp.feature_implant_across_seq(seq, consensus, encoding="str")

array(['CCCCTATAAATACCCCTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'ACCCCTATAAATACCCCGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGCCCCTATAAATACCCCTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGCCCCTATAAATACCCCTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACCCCTATAAATACCCCGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACCCCCTATAAATACCCCGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACACCCCTATAAATACCCCGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGCCCCTATAAATACCCCCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGACCCCTATAAATACCCCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATCCCCTATAAATACCCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATTCCCCTATAAATACCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATTTCCCCTATAAATACCCCACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATTTTCCCCTATAAATACCCCCGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',

In [8]:
# Implant the full PFM into the one-hot sequence at `pos`; the implanted rows hold PFM values.
eu.pp.feature_implant_seq(ohe_seq, pfm, pos, encoding="onehot")

array([[1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 1.    , 0.    ],
       [0.1275, 0.3765, 0.1195, 0.3765],
       [0.1575, 0.3985, 0.199 , 0.2455],
       [0.249 , 0.303 , 0.197 , 0.251 ],
       [0.1235, 0.655 , 0.0755, 0.1455],
       [0.01  , 0.002 , 0.002 , 0.986 ],
       [0.968 , 0.    , 0.    , 0.032 ],
       [0.002 , 0.014 , 0.006 , 0.978 ],
       [0.992 , 0.    , 0.002 , 0.006 ],
       [0.653 , 0.012 , 0.002 , 0.333 ],
       [0.974 , 0.    , 0.008 , 0.018 ],
       [0.341 , 0.028 , 0.036 , 0.5955],
       [0.6955, 0.0815, 0.1195, 0.1035],
       [0.1255, 0.432 , 0.3165, 0.1255],
       [0.291 , 0.418 , 0.175 , 0.1155],
       [0.263 , 0.3445, 0.1755, 0.2175],
       [0.307 , 0.3085, 0.2365, 0.1475],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 1.    , 0.    ],
       [0.    , 0.    , 1.    , 0.    ],
       [0.    , 0.    , 1.    , 0.    ],
       [0.    , 1.    , 0.    , 0.    ],
       [0.    , 

In [9]:
# Implant the full PFM at each position along the one-hot sequence in turn.
eu.pp.feature_implant_across_seq(ohe_seq, pfm, encoding="onehot")

array([[[0.1275, 0.3765, 0.1195, 0.3765],
        [0.1575, 0.3985, 0.199 , 0.2455],
        [0.249 , 0.303 , 0.197 , 0.251 ],
        ...,
        [1.    , 0.    , 0.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ]],

       [[1.    , 0.    , 0.    , 0.    ],
        [0.1275, 0.3765, 0.1195, 0.3765],
        [0.1575, 0.3985, 0.199 , 0.2455],
        ...,
        [1.    , 0.    , 0.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ]],

       [[1.    , 0.    , 0.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        [0.1275, 0.3765, 0.1195, 0.3765],
        ...,
        [1.    , 0.    , 0.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ]],

       ...,

       [[1.    , 0.    , 0.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        ...,
        [0.307 , 0.3085, 0.2365, 0.1475],
     

In [10]:
# Same sliding implant with onehot=True; the displayed result contains only 0/1 values.
eu.pp.feature_implant_across_seq(ohe_seq, pfm, encoding="onehot", onehot=True)

array([[[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]],

       [[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]],

       [[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]],

       ...,

       [[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]],

       [[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]],

       [[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1.

In [11]:
# Implant the consensus feature into the first sequence of `sdata` and run the
# model on the results; the call returns an array of float32 values.
# NOTE(review): with store=True the result appears to be written to
# sdata.seqsm (see the 'seq000_TATA_slide' key displayed later) -- confirm.
eu.interpret.feature_implant(
    model, 
    sdata, 
    seq_id=sdata.names[0], 
    feature=consensus, 
    feature_name=name, 
    encoding="str", 
    onehot=False, 
    device="cpu", 
    store=True
)

array([-0.12860738, -0.20724249, -0.09944402, -0.15735872, -0.07232916,
        0.10080706, -0.05799149, -0.00366257, -0.13281582, -0.03578924,
        0.07352825, -0.08969332, -0.14790955, -0.19986603, -0.17278814,
       -0.18217488, -0.07786699, -0.18028736, -0.13209815, -0.0564633 ,
       -0.15503126, -0.09165487, -0.11240477, -0.0220406 , -0.00690747,
       -0.05037049, -0.13428695, -0.07957186,  0.05299611, -0.00722817,
       -0.05692093, -0.06351749, -0.12362285, -0.04635581, -0.10504552,
        0.02596581,  0.10609262, -0.04033621, -0.09065762, -0.05444472,
       -0.05513809, -0.08035021, -0.02563847, -0.05098003,  0.05189031,
       -0.14998351,  0.04425012,  0.00295492, -0.0998994 , -0.09659025,
       -0.06869654], dtype=float32)

In [16]:
# Run the feature implant for the sequences in `sdata` (progress bar counts 1000).
# seqsm_key is the key under which results are stored, f"{name}_slide"
# ('TATA_slide' in the sdata summary displayed later).
eu.interpret.feature_implant_sdata(
    model,
    sdata,
    feature=consensus,
    seqsm_key=f"{name}_slide",
    encoding="str",
    onehot=False,
    device="cpu"
)

Implanting feature:   0%|          | 0/1000 [00:00<?, ?it/s]

In [17]:
# Display the SeqData summary to inspect what the cells above added.
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = None
seqs_annot: 'target'
pos_annot: PyRanges object with 1400 features
seqsm: 'seq000_TATA_slide', 'TATA_slide'
uns: None

# Dataset preprocessing

In [66]:
# Reload the random dataset and one-hot encode its sequences.
sdata = eu.datasets.random1000()
eu.pp.ohe_seqs_sdata(sdata)
# Plain arrays passed to the array-level tests below.
# NOTE(review): this rebinds the global name `ohe_seqs`.
ohe_seqs = sdata.ohe_seqs
# NOTE(review): an earlier sdata summary in this notebook lists the annotation
# column as 'target', not 'activity_0' -- confirm the column name for the installed version.
targets = sdata.seqs_annot["activity_0"].values

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added


In [70]:
def test_split_train_test(ohe_seqs, targets):
    """Check that the train/test split keeps features and targets aligned and loses nothing.

    Parameters
    ----------
    ohe_seqs : one-hot encoded sequences to split.
    targets : target values, one per sequence.
    """
    split = eu.pp.split_train_test(ohe_seqs, targets)
    train_seqs, test_seqs, train_targets, test_targets = split

    # Within each partition, features and targets have the same length.
    assert len(train_seqs) == len(train_targets)
    assert len(test_seqs) == len(test_targets)

    # The two partitions together account for every input row.
    assert len(train_seqs) + len(test_seqs) == len(ohe_seqs)
    assert len(train_targets) + len(test_targets) == len(targets)
test_split_train_test(ohe_seqs, targets)

In [74]:
def test_standardize_features(ohe_seqs, targets):
    """Check that standardizing features preserves the train and test array shapes.

    Parameters
    ----------
    ohe_seqs : one-hot encoded sequences to split and standardize.
    targets : target values, one per sequence (only needed for the split).
    """
    # The split targets are not needed for this check.
    train_seqs, test_seqs, _, _ = eu.pp.split_train_test(ohe_seqs, targets)
    standardized_train, standardized_test = eu.pp.standardize_features(train_seqs, test_seqs)
    assert standardized_train.shape == train_seqs.shape
    assert standardized_test.shape == test_seqs.shape
# The original call passed no arguments, which raises TypeError on a fresh run.
test_standardize_features(ohe_seqs, targets)

None


In [75]:
def test_binarize_values(targets):
    """Check that binarizing target values keeps the array shape unchanged.

    Parameters
    ----------
    targets : array of target values.
    """
    binarized_shape = eu.pp.binarize_values(targets).shape
    assert binarized_shape == targets.shape
test_binarize_values(targets)

# `SeqData` Functions

In [58]:
# Rebind `sdata` to a newly loaded random1000 dataset for the SeqData function tests.
sdata = eu.datasets.random1000()

In [82]:
from eugene.dataload import SeqData
def test_sanitize_seqs_sdata(sdata):
    """Check that sanitizing with ``copy=True`` returns a ``SeqData`` object."""
    assert isinstance(eu.pp.sanitize_seqs_sdata(sdata, copy=True), SeqData)

def test_ohe_seqs_sdata(sdata):
    """Check that one-hot encoding fills ``sdata.ohe_seqs`` with one entry per sequence."""
    eu.pp.ohe_seqs_sdata(sdata)
    encoded = sdata.ohe_seqs
    assert encoded is not None
    assert len(encoded) == len(sdata.seqs)

def test_reverse_complement_seqs_sdata(sdata):
    """Smoke test: reverse-complementing on ``sdata`` runs without raising."""
    eu.pp.reverse_complement_seqs_sdata(
        sdata,
        rc_seqs=False,
        copy=False,
    )

def test_clean_nan_targets_sdata(sdata):
    """Smoke test: cleaning NaN targets for ``activity_0`` runs without raising."""
    eu.pp.clean_nan_targets_sdata(
        sdata,
        target_keys="activity_0",
        copy=True,
    )

def test_clamp_targets_sdata(sdata):
    """Check that clamping targets stores the clamp values in ``sdata.uns``."""
    # The clamp call is given the "train_val" key, so the split runs first.
    eu.pp.train_test_split_sdata(sdata)
    eu.pp.clamp_targets_sdata(
        sdata,
        "activity_0",
        0.8,
        "train_val",
        store_clamp_nums=True,
    )
    stored = sdata.uns["clamp_nums"]
    assert stored is not None

def test_scale_targets_sdata(sdata):
    """Check that scaling targets stores the fitted scaler in ``sdata.uns``."""
    # The scale call is given the "train_val" key, so the split runs first.
    eu.pp.train_test_split_sdata(sdata)
    eu.pp.scale_targets_sdata(
        sdata,
        "activity_0",
        "train_val",
        store_scaler=True,
    )
    # Truthiness check on the container itself, kept as in the original.
    assert sdata
    scaler = sdata.uns["scaler"]
    assert scaler is not None

def test_binarize_targets_sdata(sdata):
    """Smoke test: binarizing ``activity_0`` with an upper threshold of 0 runs without raising."""
    eu.pp.binarize_targets_sdata(
        sdata,
        target_keys="activity_0",
        upper_threshold=0,
        suffix=True,
        copy=False,
    )

def test_train_test_split_sdata(sdata):
    """Check that the split makes ``sdata["train_val"]`` available."""
    eu.pp.train_test_split_sdata(sdata)
    split_labels = sdata["train_val"]
    assert split_labels is not None

def test_prepare_seqs_sdata(sdata):
    """Check that ``prepare_seqs_sdata`` fills the sequence fields and the split.

    After the call, ``seqs``, ``ohe_seqs`` and ``ohe_rev_seqs`` must be set and
    equally long, and ``sdata["train_val"]`` must be available.
    """
    eu.pp.prepare_seqs_sdata(sdata)
    for field in ("seqs", "ohe_seqs", "ohe_rev_seqs"):
        assert getattr(sdata, field) is not None
    assert len(sdata.seqs) == len(sdata.ohe_seqs) == len(sdata.ohe_rev_seqs)
    assert sdata["train_val"] is not None

# Run every SeqData-level test against the shared `sdata` object.
# Each test function takes `sdata` as a required argument; the original cell
# omitted it on all but the first two calls, which raises TypeError.
# NOTE(review): several of these calls modify `sdata` in place, so the order matters.
test_sanitize_seqs_sdata(sdata)
test_ohe_seqs_sdata(sdata)
test_reverse_complement_seqs_sdata(sdata)
test_clean_nan_targets_sdata(sdata)
test_clamp_targets_sdata(sdata)
test_scale_targets_sdata(sdata)
test_binarize_targets_sdata(sdata)
test_train_test_split_sdata(sdata)
test_prepare_seqs_sdata(sdata)

Dropped targets: []


  f"X has feature names, but {self.__class__.__name__} was fitted without"


  0%|          | 0/3 [00:00<?, ?it/s]

---