# Testing implanting a feature into sequences

**Authorship:**
Adam Klie, *09/01/2022*
***
**Description:**
Notebook to test adding a feature to sequences.
***

In [16]:
# Autoreload extension
# Guarded so that re-running this cell in a live kernel does not try to load
# the extension a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload modules before each cell is executed.
%autoreload 2

# Basic import
import torch
import numpy as np
import eugene as eu
# Last expression of the cell: display the eugene version in use.
eu.__version__

'0.1.0'

In [2]:
# Load eugene's `random1000` dataset; later cell output refers to it as a SeqData object.
sdata = eu.datasets.random1000()

In [3]:
# One-hot encode the sequences; per the cell output this modifies `sdata`
# by adding `ohe_seqs` for all 1000 sequences.
eu.pp.ohe_seqs_sdata(sdata)

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added


In [4]:
# Prep data
# Take the first sequence as the test subject, in both string and one-hot form.
seq = sdata.seqs[0]
ohe_seq = eu.pp.ohe_seq(seq)

# Size the model from the data rather than hard-coding the sequence length,
# so the notebook keeps working if the dataset's sequence length changes.
model = eu.models.DeepBind(input_len=len(seq), output_dim=1)

# Load the motif collection and pull out the TATA motif's attributes.
meme = eu.dl.motif.MinimalMEME(path="../../_data/CPEs.meme")
motif = meme.motifs["TATA"]
name = motif.name
pfm = motif.pfm
consensus = motif.consensus

# Start index passed to the single-position implant cells below.
pos = 2

In [5]:
# Implant the consensus string into `seq` using `pos`; in the output the
# consensus overwrites bases starting at index 2 and the length is unchanged.
eu.pp.feature_implant_seq(seq, consensus, pos, encoding="str")

'AGCCCCTATAAATACCCCTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG'

In [6]:
# Slide the consensus along `seq`; the output is an array with one implanted
# string per start position.
eu.pp.feature_implant_across_seq(seq, consensus, encoding="str")

array(['CCCCTATAAATACCCCTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'ACCCCTATAAATACCCCGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGCCCCTATAAATACCCCTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGCCCCTATAAATACCCCTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACCCCTATAAATACCCCGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACCCCCTATAAATACCCCGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACACCCCTATAAATACCCCGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGCCCCTATAAATACCCCCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGACCCCTATAAATACCCCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATCCCCTATAAATACCCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATTCCCCTATAAATACCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATTTCCCCTATAAATACCCCACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
       'AGGACAGATTTTCCCCTATAAATACCCCCGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',

In [7]:
# Same single-position implant, but with the PFM into the one-hot sequence.
# Transposed for display so each printed row holds the 4 channel values of one position.
eu.pp.feature_implant_seq(ohe_seq, pfm, pos, encoding="onehot").transpose()

array([[1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 1.    , 0.    ],
       [0.1275, 0.3765, 0.1195, 0.3765],
       [0.1575, 0.3985, 0.199 , 0.2455],
       [0.249 , 0.303 , 0.197 , 0.251 ],
       [0.1235, 0.655 , 0.0755, 0.1455],
       [0.01  , 0.002 , 0.002 , 0.986 ],
       [0.968 , 0.    , 0.    , 0.032 ],
       [0.002 , 0.014 , 0.006 , 0.978 ],
       [0.992 , 0.    , 0.002 , 0.006 ],
       [0.653 , 0.012 , 0.002 , 0.333 ],
       [0.974 , 0.    , 0.008 , 0.018 ],
       [0.341 , 0.028 , 0.036 , 0.5955],
       [0.6955, 0.0815, 0.1195, 0.1035],
       [0.1255, 0.432 , 0.3165, 0.1255],
       [0.291 , 0.418 , 0.175 , 0.1155],
       [0.263 , 0.3445, 0.1755, 0.2175],
       [0.307 , 0.3085, 0.2365, 0.1475],
       [0.    , 0.    , 1.    , 0.    ],
       [1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 

In [8]:
# Slide the PFM along the one-hot sequence, swap the last two axes for display,
# and show only the first result (PFM rows appear at the very start in the output).
eu.pp.feature_implant_across_seq(ohe_seq, pfm, encoding="onehot").transpose(0, 2, 1)[0]

array([[0.1275, 0.3765, 0.1195, 0.3765],
       [0.1575, 0.3985, 0.199 , 0.2455],
       [0.249 , 0.303 , 0.197 , 0.251 ],
       [0.1235, 0.655 , 0.0755, 0.1455],
       [0.01  , 0.002 , 0.002 , 0.986 ],
       [0.968 , 0.    , 0.    , 0.032 ],
       [0.002 , 0.014 , 0.006 , 0.978 ],
       [0.992 , 0.    , 0.002 , 0.006 ],
       [0.653 , 0.012 , 0.002 , 0.333 ],
       [0.974 , 0.    , 0.008 , 0.018 ],
       [0.341 , 0.028 , 0.036 , 0.5955],
       [0.6955, 0.0815, 0.1195, 0.1035],
       [0.1255, 0.432 , 0.3165, 0.1255],
       [0.291 , 0.418 , 0.175 , 0.1155],
       [0.263 , 0.3445, 0.1755, 0.2175],
       [0.307 , 0.3085, 0.2365, 0.1475],
       [0.    , 1.    , 0.    , 0.    ],
       [1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 1.    , 0.    ],
       [1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 0.    , 0.    , 1.    ],
       [0.    , 

In [9]:
# Same slide with `onehot=True`; the output contains only 0/1 values (float16),
# so this flag presumably binarises the implanted PFM -- TODO confirm against eugene.
eu.pp.feature_implant_across_seq(ohe_seq, pfm, encoding="onehot", onehot=True)

array([[[0., 0., 0., ..., 1., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 1., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 1., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 1., 1., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 1., 0.],
        [0., 1., 1., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 1., 1.],
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float16)

In [10]:
from eugene.preprocess._utils import _token2one_hot

In [26]:
# First dimension of the PFM (16 in the output): one row per motif position.
pfm.shape[0]

16

In [28]:
# First dimension of the one-hot sequence (4 in the output), i.e. the opposite
# axis order to `pfm`.
ohe_seq.shape[0]

4

In [25]:
# Take the highest-scoring column index at each PFM position and convert those
# indices to a one-hot matrix (4 x 16, int8 in the output).
_token2one_hot(pfm.argmax(axis=1))

array([[0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]], dtype=int8)

In [12]:
# Restrict the model-based tests to the first 32 sequences.
sdata_sub = sdata[:32]

In [18]:
# Score the model on one sequence (the first name in the subset) with the
# consensus implanted along it; the output holds 51 float32 values.
# `store=False`: presumably returns the scores without writing them to sdata -- TODO confirm.
eu.interpret.feature_implant_seq_sdata(
    model, 
    sdata_sub, 
    seq_id=sdata_sub.names[0], 
    feature=consensus, 
    feature_name=name, 
    encoding="str", 
    onehot=False, 
    device="cpu", 
    store=False
)

array([ 0.2840202 ,  0.13159242,  0.15137582,  0.2257209 ,  0.1829635 ,
        0.0850001 ,  0.14489007,  0.15463647,  0.25749475,  0.25224042,
        0.27782148,  0.31995624,  0.11487965,  0.34288234,  0.21865895,
        0.06119195,  0.1654991 ,  0.04208153,  0.25275224,  0.01562067,
        0.15343966,  0.04600662,  0.10595044,  0.10485399,  0.15623318,
       -0.07080616,  0.29702222,  0.22360967,  0.14814068,  0.3015874 ,
        0.02407159,  0.18645068,  0.11784181,  0.18216039,  0.23763016,
        0.17713389,  0.24685998,  0.0863037 ,  0.13618197,  0.18957342,
        0.30713362,  0.29060775,  0.19423166, -0.01986884,  0.16932395,
        0.04566042,  0.31782   ,  0.33257705,  0.15236509,  0.32092166,
        0.01561582], dtype=float32)

In [20]:
# Run the implant scan for every sequence in the subset (progress bar counts 32).
# `seqsm_key` names the entry the results are expected to be stored under.
# NOTE(review): the saved output begins with an autoreload failure traceback and
# raw printed score arrays; consider clearing it before sharing.
eu.interpret.feature_implant_seqs_sdata(
    model,
    sdata_sub,
    feature=consensus,
    seqsm_key=f"{name}_slide",
    encoding="str",
    onehot=False,
    device="cpu"
)

[autoreload of eugene.interpret._in_silico failed: Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/vscode/.local/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 410, in superreload
    update_generic(old_obj, new_obj)
  File "/home/vscode/.local/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 347, in update_generic
    update(a, b)
  File "/home/vscode/.local/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 266, in update_function
    setattr(old, name, getattr(new, name))
ValueError: feature_implant_seqs_sdata() requires a code object with 1 free vars, not 0
]


Implanting feature in all seqs of sdata:   0%|          | 0/32 [00:00<?, ?it/s]

[ 0.13945583  0.14916076  0.1143868   0.12109571  0.10734119  0.32635742
  0.25391227  0.10125061  0.23802157  0.19200782  0.18360914  0.07214929
  0.2777334   0.1810311   0.22851254  0.19759738  0.19483317  0.26029253
  0.16783315  0.1755469   0.05577534  0.18694304  0.08489668  0.0877078
  0.19585054  0.37052083  0.16390066 -0.02534203  0.10956874  0.06439628
  0.12784773  0.05557682  0.2773698   0.08432941  0.22512048  0.23845126
  0.22319284  0.12008636  0.09383175  0.15286939  0.24958248  0.19724339
  0.256887    0.21864878  0.09755334  0.12350366  0.16274306  0.15714581
  0.22879297  0.14552264  0.16242336]
[ 3.00215304e-01  1.25221476e-01  1.51940629e-01  2.76171505e-01
  1.44046500e-01  1.92814663e-01  2.10057631e-01  3.28698516e-01
  9.13791284e-02  1.40887022e-01  1.95051506e-01 -1.75833702e-05
  2.01205030e-01  2.23088592e-01  9.56121087e-02  2.33667850e-01
  1.40090197e-01  2.39381343e-01  3.64337116e-02  1.78422287e-01
  3.07758033e-01  1.17617115e-01  1.97365418e-01  1.99

In [22]:
# Inspect the stored slide scores on the same object that was passed to
# `feature_implant_seqs_sdata`, using the same key expression, so this cell
# does not depend on a hard-coded motif name or on state left on `sdata`.
sdata_sub.seqsm[f"{name}_slide"].shape

(32, 51)

---

# Testing `in_silico_evolution` function

**Authorship:**
Adam Klie, *08/08/2022*
***
**Description:**
Notebook for testing the `in_silico_evolution` function

In [1]:
import numpy as np
import pandas as pd

# Autoreload extension
# Guarded so that re-running this cell does not try to load the extension twice.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload modules before each cell is executed.
%autoreload 2

In [2]:
import eugene as eu

Global seed set to 13
2022-09-11 04:54:27.500841: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-11 04:54:27.674315: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-11 04:54:27.674348: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-11 04:54:27.705632: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-11 04:54:28.309745: W tensorfl

# Data E-T-L

In [3]:
# Load the random dataset and run eugene's sequence preparation. Per the cell
# output this adds `ohe_seqs`, `ohe_rev_seqs` and a `train_val` annotation.
sdata = eu.datasets.random1000()
eu.pp.prepare_seqs_sdata(sdata)
# Last expression: display the SeqData summary.
sdata

  0%|          | 0/3 [00:00<?, ?it/s]

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + train_val


SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = (1000, 4, 66)
ohe_rev_seqs = (1000, 4, 66)
seqs_annot: 'target', 'train_val'
pos_annot: PyRanges object with 1400 features
seqsm: None
uns: None

# Model instantiation and initialization

In [4]:
# DeepBind model with a single output; input_len=66 matches the last dimension
# of `ohe_seqs` shown above, (1000, 4, 66).
model = eu.models.DeepBind(input_len=66, output_dim=1)
# Initialise the model's weights with eugene's helper.
eu.models.init_weights(model)

# Test API

In [5]:
# Build the test inputs: the first `n_seqs` sequences (string and one-hot),
# plus one randomly chosen member of that subset in both forms.
n_seqs = 32
seq_num = np.random.choice(n_seqs, size=1, replace=False).squeeze()
seqs, ohe_seqs = sdata.seqs[:n_seqs], sdata.ohe_seqs[:n_seqs]
seq, ohe_seq = seqs[seq_num], ohe_seqs[seq_num]
seq, ohe_seq.shape, ohe_seqs.shape

('TCATTAGATTGGGTTGCTGTTTAGCAGGACCATATCCGGAGGCTTTAATGTTACCCGGCAGTGCTT',
 (4, 66),
 (32, 4, 66))

## Test `eu.interpret.best_k_muts`

In [9]:
# Ask for the single best mutation (k=1), decode the mutated sequence, and show
# the original/mutated base at the changed position, both sequences, and their
# Hamming distance.
mut_ohe_seq, delta, delta_ind = eu.interpret.best_k_muts(model, ohe_seq, k=1)
mut_seq = eu.pp.decode_seq(mut_ohe_seq.squeeze(axis=0))
mut_idx = delta_ind.squeeze()
(
    seq[mut_idx],
    mut_seq[mut_idx],
    seq,
    mut_seq,
    eu.pp._utils._hamming_distance(seq, mut_seq),
)

[[ 2 61]] [61] [0.00944884]
(4, 66)


('T',
 'G',
 'TCATTAGATTGGGTTGCTGTTTAGCAGGACCATATCCGGAGGCTTTAATGTTACCCGGCAGTGCTT',
 'TCATTAGATTGGGTTGCTGTTTAGCAGGACCATATCCGGAGGCTTTAATGTTACCCGGCAGGGCTT',
 1)

## Test `eu.interpret.best_mut_seqs`

In [10]:
# Best single mutation for every sequence in the batch. Each mutated sequence
# must differ from its original at exactly one position; details are printed
# for the first three only.
mut_ohe_seqs, deltas, delta_inds = eu.interpret.best_mut_seqs(model, ohe_seqs, batch_size=32)
for i, mut_ohe in enumerate(mut_ohe_seqs):
    mut_seq = eu.pp.decode_seq(mut_ohe)
    n_diff = eu.pp._utils._hamming_distance(seqs[i], mut_seq)
    assert n_diff == 1
    if i >= 3:
        continue
    original, mut_idx = seqs[i], delta_inds[i]
    print(deltas[i], mut_idx)
    print(original[mut_idx], mut_seq[mut_idx])
    # Trailing "" reproduces the blank separator line between entries.
    print(original, mut_seq, n_diff, "", sep="\n")

0.015379578 55
A C
AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG
AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGCCAATATAAGG
1

0.025349513 50
G C
TGACTCCAGGAAGGACGTGTTGTTCGAGAAGAGCAGCCGAAAAAGTTAGCGATCGGCTCGACTTTC
TGACTCCAGGAAGGACGTGTTGTTCGAGAAGAGCAGCCGAAAAAGTTAGCCATCGGCTCGACTTTC
1

0.024218425 13
C A
TATACTAGGAAGTCGACTTAAGAGATGCAAACAAAGAGTGGGGCATATTATCTAGCCAGACGTCAC
TATACTAGGAAGTAGACTTAAGAGATGCAAACAAAGAGTGGGGCATATTATCTAGCCAGACGTCAC
1



## Test `eu.interpret.evolution`

In [11]:
# Evolve the single test sequence, then report each round's score change and
# position together with the base before and after at that position.
evolved_ohe_seq, deltas, delta_pos = eu.interpret.evolution(
    model,
    ohe_seq,
    force_different=True,
)

evolved_seq = eu.pp.decode_seq(evolved_ohe_seq)
for step, delta in enumerate(deltas):
    mut_idx = delta_pos[step]
    print(delta, mut_idx)
    print(seq[mut_idx], evolved_seq[mut_idx])

# Finish with the original, the evolved sequence, and how many positions differ.
print(seq)
print(evolved_seq)
print(eu.pp._utils._hamming_distance(seq, evolved_seq))

[[ 2 61]
 [ 0 40]
 [ 0 61]
 [ 3 56]
 [ 3 57]
 [ 0  6]
 [ 3 23]
 [ 2  5]
 [ 1 39]
 [ 0  3]] [61 40 61 56 57  6 23  5 39  3] [0.00944884 0.00889377 0.00944884 0.00746579 0.00596726 0.00487195
 0.00419958 0.00401482 0.00401039 0.00396453]
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
[[ 0 40]
 [ 3 56]
 [ 0  6]
 [ 3 57]
 [ 2  5]
 [ 0  3]
 [ 1 32]
 [ 2  1]
 [ 1 62]
 [ 1 43]] [40 56  6 57  5  3 32  1 62 43] [0.01000783 0.00768696 0.00515807 0.0044408  0.00430094 0.00425066
 0.00414926 0.0038837  0.00353384 0.0033012 ]
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
[[ 3 56]
 [ 2 35]
 [ 0 35]
 [ 3 23]
 [ 1 32]
 [ 3 27]
 [ 2  1]
 [ 1 27]
 [ 0 38]
 [ 0  6]] [56 35 35 23 32 27  1 27 38  6] [0.00799286 0.00742814 0.00742814 0.00555712 0.00519584 0.00423892
 0.00360246 0.00423892 0.00344777 0.00329013]
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
(4, 66)
[[ 2 35]
 [ 3 23]
 [ 1 32]
 [ 0 35]
 [ 2  1]
 [ 3 11

In [12]:
# Keep only the first 32 sequences for the SeqData-level evolution test.
sdata_subset = sdata[:32]

In [20]:
# Evolve every sequence in the subset for 5 rounds. Per the cell output this
# adds an `evolved_5_scores` column to `seqs_annot`.
# `return_seqs=True`: presumably also returns the evolved sequences -- TODO confirm.
evolved_seqs = eu.interpret.evolve_seqs_sdata(
    model,
    sdata_subset,
    rounds=5,
    force_different=True,
    return_seqs=True
)

Evolving seqs:   0%|          | 0/32 [00:00<?, ?it/s]

SeqData object modified:
    seqs_annot:
        + evolved_5_scores


In [21]:
# Display the per-sequence annotations, including the score columns.
# NOTE(review): the saved output also shows `evolved_3_scores`, which no visible
# cell creates -- likely left over from an earlier run with rounds=3.
sdata_subset.seqs_annot

Unnamed: 0,target,train_val,original_scores,evolved_3_scores,evolved_5_scores
seq000,0.866168,True,0.174413,0.214282,0.235417
seq001,0.800737,True,0.147729,0.203262,0.223784
seq002,0.703108,True,0.155737,0.2027,0.219492
seq003,0.465782,False,0.154625,0.197601,0.215584
seq004,0.676781,False,0.158724,0.207494,0.225771
seq005,0.310314,True,0.135241,0.197334,0.227165
seq006,0.954574,True,0.151677,0.20496,0.224981
seq007,0.004132,True,0.157992,0.20527,0.223052
seq008,0.030239,True,0.214875,0.239488,0.253772
seq009,0.319772,True,0.183135,0.223346,0.236756


---

# Scratch