# Implementing `in_silico_evolution` function

**Authorship:**
Adam Klie, *08/08/2022*
***
**Description:**
Notebook for testing the `in_silico_evolution` function

In [1]:
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

In [2]:
import eugene as eu

Global seed set to 13
2022-08-09 15:05:07.012163: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-09 15:05:07.012200: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Load data

In [184]:
sdata = eu.datasets.random1000()
eu.pp.prepare_data(sdata)
sdata

  0%|          | 0/3 [00:00<?, ?it/s]

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + train


SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'target', 'train'
pos_annot: PyRanges object with 1400 features
seqsm: None
uns: None

In [185]:
model = eu.models.DeepBind(input_len=66, output_dim=1)

In [468]:
import torch
from yuzu.naive_ism import naive_ism
from eugene import settings

#https://stackoverflow.com/questions/43386432/how-to-get-indexes-of-k-maximum-values-from-a-numpy-multidimensional-array
def k_largest_index_argsort(a, k):
    idx = np.argsort(a.ravel())[:-k-1:-1]
    return np.column_stack(np.unravel_index(idx, a.shape))


def in_silico_best_mut_seq(
    model,
    X,
    k=1,
    device="cpu"
) -> np.ndarray:
    X = np.expand_dims(X, axis=0) if X.ndim == 2 else X
    X = X.transpose(0, 2, 1) if X.shape[2] == 4 else X
    X = torch.Tensor(X).float().numpy()
    X_ism = naive_ism(model, X, device="cpu", batch_size=1)
    X_ism = X_ism.squeeze(axis=0)
    inds = k_largest_index_argsort(X_ism, k)
    locs = inds[:, 1]
    maxs = np.max(X_ism, axis=0)[locs]
    #print(inds, locs, maxs)
    #_max, ind = np.max(X_ism), np.unravel_index(X_ism.argmax(), X_ism.shape)
    mut_Xs = np.zeros((k, X.shape[2], X.shape[1]))
    #print(mut_Xs.shape)
    for i in range(k):
        mut_X = X.copy().transpose(0, 2, 1).squeeze(axis=0)
        mut_X[inds[i][1]] = np.zeros(mut_X.shape[1])
        mut_X[inds[i][1]][inds[i][0]] = 1
        #print(mut_X.shape)
        mut_Xs[i] = mut_X
    return mut_Xs, maxs, locs


def in_silico_best_mut_seqs(
    model,
    X,
    batch_size=None,
    device="cpu"
) -> np.ndarray:
    eu.settings.batch_size if batch_size is None else batch_size
    X = X.transpose(0, 2, 1) if X.shape[2] == 4 else X
    X = torch.Tensor(X).float().numpy()
    print(X.shape)
    X_ism = naive_ism(model, X, device="cpu", batch_size=batch_size)
    _max, _inds, mut_X = [], [], X.copy().transpose(0, 2, 1)
    print(mut_X.shape)
    for i in range(len(mut_X)):
        _max.append(np.max(X_ism[i]))
        ind = np.unravel_index(X_ism[i].argmax(), X_ism[i].shape)
        print(ind)
        _inds.append(ind[1])
        mut_X[i][ind[1]] = np.zeros(mut_X.shape[2])
        mut_X[i][ind[1]][ind[0]] = 1
    return mut_X, np.array(_max), np.array(_inds)


def in_silico_evolution(
    model,
    X,
    rounds=10,
    k=10,
    batch_size=None,
    device="cpu"
) -> np.ndarray:
    eu.settings.batch_size if batch_size is None else batch_size
    X = X.copy()
    mutated_positions, mutated_scores = [], []
    for r in range(rounds):
        print(f"Round {r}")
        if X.ndim == 2:
            mut_X, score, positions = in_silico_best_mut_seq(model, X, k=10, device=device)
            print(mut_X.shape, positions)
            for i, p in enumerate(positions):
                if p not in mutated_positions:
                    X = mut_X[i]
                    mutated_positions.append(p)
                    mutated_scores.append(score[i])
                    break
    return X, mutated_scores, mutated_positions

In [469]:
seq_num = 0
seqs = sdata.seqs[:32]
seq = seqs[seq_num]
ohe_seqs = sdata.ohe_seqs[:32]
ohe_seq = ohe_seqs[seq_num]
seq, ohe_seq.shape, ohe_seqs.shape

('AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 (66, 4),
 (32, 66, 4))

In [473]:
evolved_ohe_seq, deltas, delta_pos = in_silico_evolution(model, ohe_seq)
evolved_seq = eu.pp.decode_DNA_seq(evolved_ohe_seq)
for i in range(len(deltas)):
    print(deltas[i], delta_pos[i])
    print(seq[delta_pos[i]], evolved_seq[delta_pos[i]])
print(seq)
print(evolved_seq)
print(eu.preprocessing._utils._hamming_distance(seq, evolved_seq))

Round 0
(10, 66, 4) [31 50  8 56 50 55 61 35 49 16]
Round 1
(10, 66, 4) [ 3 31 31 55 16 55 56 35 38  3]
Round 2
(10, 66, 4) [ 3 16 31 23 38 54 58 26  8 40]
Round 3
(10, 66, 4) [ 3 35 31 16 23 13 38 31 54  8]
Round 4
(10, 66, 4) [35 35 31 23 40  3 13 31 47 54]
Round 5
(10, 66, 4) [31 54  3 13 23 35 22 23 35 31]
Round 6
(10, 66, 4) [55 31  3 35 54 38 35 16 22 47]
Round 7
(10, 66, 4) [ 3 55 35 31 53 47  8 35 36 36]
Round 8
(10, 66, 4) [ 3 35 53 35 31 53 36  7  8  2]
Round 9
(10, 66, 4) [36  3 53 38 31 42 31 36 36  7]
0.0013011694 31
A C
0.0013624579 3
A C
0.0010003597 16
T G
0.0010217875 35
G C
0.00096292794 23
C A
0.0010382086 54
G A
0.0011621714 55
A T
0.0009725094 53
C A
0.0009521693 36
C T
0.0011078417 38
T G
AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG
AGGCCAGATTTTCGCGGGTTGGGACCAACGGCTCACTCGCTATAAACCGTATCAATCAATATAAGG
10


In [370]:
test_ism = in_silico_best_mut_seq(model, ohe_seq)

In [385]:
def k_largest_index_argsort(a, k):
    idx = np.argsort(a.ravel())[:-k-1:-1]
    return np.column_stack(np.unravel_index(idx, a.shape))

In [423]:
mut_ohe_seq, delta, delta_ind = in_silico_best_mut_seq(model, ohe_seq, k=3)
mut_ohe_seq.shape

[[ 1 31]
 [ 1 50]
 [ 2  8]] [31 50  8] [0.00130117 0.00114632 0.0010823 ]
(3, 66, 4)
(66, 4)
(66, 4)
(66, 4)


(3, 66, 4)

In [424]:
mut_seq = eu.pp.decode_DNA_seq(mut_ohe_seq[2])
seq, mut_seq, eu.preprocessing._utils._hamming_distance(seq, mut_seq)

('AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 'AGGACAGAGTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 1)

In [418]:
mut_seq = eu.pp.decode_DNA_seq(mut_ohe_seq.squeeze(axis=0))
delta, seq[delta_ind], mut_seq[delta_ind], seq, mut_seq, eu.preprocessing._utils._hamming_distance(seq, mut_seq)

TypeError: only integer scalar arrays can be converted to a scalar index

In [323]:
mut_ohe_seqs, deltas, delta_inds = in_silico_best_mut_seqs(model, ohe_seqs, batch_size=32)
for i in range(len(mut_ohe_seqs)):
    mut_seq = eu.pp.decode_DNA_seq(mut_ohe_seqs[i])
    assert eu.pp._utils._hamming_distance(seqs[i], mut_seq) == 1
    if i < 3:
        print(deltas[i], delta_inds[i])
        print(seqs[i][delta_inds[i]], mut_seq[delta_inds[i]])
        print(seqs[i])
        print(mut_seq)
        print(eu.pp._utils._hamming_distance(seqs[i], mut_seq))
        print()

0.0013011694 31
A C
AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG
AGGACAGATTTTCGCGTGTTGGGCCCAACGGCTCAGCCTCTATAAACCGTATCCGACAATATAAGG
1

0.0012439638 28
A G
TGACTCCAGGAAGGACGTGTTGTTCGAGAAGAGCAGCCGAAAAAGTTAGCGATCGGCTCGACTTTC
TGACTCCAGGAAGGACGTGTTGTTCGAGGAGAGCAGCCGAAAAAGTTAGCGATCGGCTCGACTTTC
1

0.0010633916 40
G C
TATACTAGGAAGTCGACTTAAGAGATGCAAACAAAGAGTGGGGCATATTATCTAGCCAGACGTCAC
TATACTAGGAAGTCGACTTAAGAGATGCAAACAAAGAGTGCGGCATATTATCTAGCCAGACGTCAC
1



In [429]:
ohe_seq.shape

(66, 4)

[[ 1 31]
 [ 1 50]
 [ 2  8]
 [ 2 56]
 [ 2 50]
 [ 2 55]
 [ 2 61]
 [ 1 35]
 [ 1 49]
 [ 2 16]] [31 50  8 56 50 55 61 35 49 16] [0.00130117 0.00114632 0.0010823  0.00102121 0.00114632 0.0009747
 0.00093849 0.00093365 0.00093304 0.00093032]
(10, 66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4)
(66, 4) 31


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

('AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 'AGGCCAGATTTTCGCGTGTTGGGCCCAACGGCTCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 2)

---

# Scratch

In [None]:
seq, mut_seq

('TGACTCCAGGAAGGACGTGTTGTTCGAGAAGAGCAGCCGAAAAAGTTAGCGATCGGCTCGACTTTC',
 'TGACTCCAGGAAGGACGTGTTGTTCGAGGAGAGCAGCCGAAAAAGTTAGCGATCGGCTCGACTTTC')

In [None]:
eu.pp._utils._hamming_distance(seq, mut_seq)

1

In [None]:
mut_seq = eu.pp.decode_DNA_seq(mut_ohe_seq)
seq[ind[0]], mut_seq[ind[0]], seq[ind[0]-3:ind[0]+3], mut_seq[ind[0]-3:ind[0]+3]

('A', 'A', 'AAGAGC', 'GAGAGC')

In [None]:
x = sdata.ohe_seqs.transpose(0, 2, 1)
x = torch.Tensor(x).float().numpy()
yuzu_imps = naive_ism(model, x, device="cpu", batch_size=32)
yuzu_imps[0, :, :5].T

array([[0.0000000e+00, 2.8893352e-05, 7.3432922e-05, 9.3489885e-05],
       [4.0248036e-05, 1.9149482e-04, 0.0000000e+00, 6.0826540e-04],
       [4.8264861e-05, 3.5522878e-04, 0.0000000e+00, 3.5077333e-04],
       [0.0000000e+00, 9.0007484e-04, 7.3318183e-04, 5.9771538e-04],
       [7.8827143e-06, 0.0000000e+00, 4.6446919e-05, 1.5121698e-04]],
      dtype=float32)

((1000, 66, 4), (66, 4))

In [None]:
eu.interpret.feature_attribution(model, sdata, saliency_method="NaiveISM", batch_size=32)

Note: NaiveISM is not implemented yet for models other than single stranded ones
No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/31 [00:00<?, ?it/s]

In [None]:
imps = sdata.uns["NaiveISM_imps"]
imps[0, :, :5].T

array([[0.00000000e+00, 2.88933516e-05, 7.34329224e-05, 9.34898853e-05],
       [4.02480364e-05, 1.91494823e-04, 0.00000000e+00, 6.08265400e-04],
       [4.82648611e-05, 3.55228782e-04, 0.00000000e+00, 3.50773335e-04],
       [0.00000000e+00, 9.00074840e-04, 7.33181834e-04, 5.97715378e-04],
       [7.88271427e-06, 0.00000000e+00, 4.64469194e-05, 1.51216984e-04]])

In [None]:
x_imp = imps[0].T
x_imp.shape

(66, 4)

In [None]:
ind = np.unravel_index(x_imp.argmax(), x_imp.shape)
ind, np.max(x_imp)

((31, 1), 0.0013011693954467773)

In [None]:
mut_ohe_seq = ohe_seq.copy()
mut_ohe_seq[ind[0]] = np.zeros(mut_ohe_seq.shape[1])
mut_ohe_seq[ind[0]][ind[1]] = 1
ohe_seq[ind[0]], mut_ohe_seq[ind[0]]

(array([1., 0., 0., 0.]), array([0., 1., 0., 0.]))

In [None]:
mut_seq = eu.pp.decode_DNA_seq(mut_ohe_seq)
seq[ind[0]], mut_seq[ind[0]], seq[ind[0]-3:ind[0]+3], mut_seq[ind[0]-3:ind[0]+3]

('A', 'C', 'CGGATC', 'CGGCTC')

In [None]:
mut_seq

'AGGACAGATTTTCGCGTGTTGGGCCCAACGGCTCAGCCTCTATAAACCGTATCCGACAATATAAGG'

In [4]:
from eugene.preprocessing import perturb_seqs

In [5]:
all_possible_muts = perturb_seqs(sdata.ohe_seqs).flatten(start_dim=0, end_dim=1)

In [6]:
all_possible_muts.shape

torch.Size([260000, 66, 4])

In [36]:
eu.interpret.feature_attribution(model, sdata[:32], saliency_method="NaiveISM", batch_size=32)

Note: NaiveISM is not implemented yet for models other than single stranded ones
No transforms given, assuming just need to tensorize).


Computing saliency on batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[[0. 1. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 1. 0.]
  [1. 0. 1. ... 1. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 1.]
  [0. 1. 0. ... 0. 1. 0.]
  [0. 0. 1. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 1. 0. 0.]
  [1. 0. 1. ... 0. 1. 0.]]

 ...

 [[0. 1. 0. ... 0. 1. 1.]
  [0. 0. 1. ... 1. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 1. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 1. 1. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 1. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 1. ... 0. 1. 0.]]] (32, 4, 66)


In [52]:
naive_ism(model, torch.Tensor(sdata.ohe_seqs.transpose(0, 2, 1)).requires_grad_().detach().cpu().numpy(), batch_size=32)

ValueError: X_0 must be of type numpy.ndarray, not <class 'torch.Tensor'>

: 

In [41]:
print(sdata.ohe_seqs.transpose(0, 2, 1), sdata.ohe_seqs.transpose(0, 2, 1).shape)

[[[1. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 1. 1. ... 0. 1. 1.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 1. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 1. 1. 0.]]

 [[0. 1. 0. ... 0. 1. 0.]
  [0. 0. 0. ... 1. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  [1. 0. 1. ... 0. 0. 0.]]

 ...

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [1. 0. 1. ... 0. 1. 1.]]

 [[0. 1. 0. ... 1. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [1. 0. 0. ... 0. 1. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [1. 0. 1. ... 1. 0. 1.]
  [0. 0. 0. ... 0. 1. 0.]]] (1000, 4, 66)


In [17]:
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'target', 'train'
pos_annot: PyRanges object with 1400 features
seqsm: None
uns: 'NaiveISM_imps'

In [7]:
from eugene.interpret._feature_attribution import _naive_ism

In [17]:
sdata.ohe_seqs.transpose((2,1))

ValueError: axes don't match array

In [6]:
model(torch.from_numpy(sdata.ohe_seqs.transpose(0, 2, 1)).float())

NameError: name 'torch' is not defined

In [51]:
naive_ism(model, torch.Tensor(sdata.ohe_seqs.transpose(0, 2, 1)).requires_grad_().detach().cpu().numpy())

ValueError: X_0 must be of type numpy.ndarray, not <class 'torch.Tensor'>

In [46]:
from yuzu.utils import perturbations

import numpy
import torch

@torch.inference_mode()
def naive_ism(model, X_0, batch_size=128, device='cpu'):
    """In-silico mutagenesis saliency scores. 
    This function will perform in-silico mutagenesis in a naive manner, i.e.,
    where each input sequence has a single mutation in it and the entirety
    of the sequence is run through the given model. It returns the ISM score,
    which is a vector of the L2 difference between the reference sequence 
    and the perturbed sequences with one value for each output of the model.
    Parameters
    ----------
    model: torch.nn.Module
        The model to use.
    X_0: numpy.ndarray
        The one-hot encoded sequence to calculate saliency for.
    batch_size: int, optional
        The size of the batches.
    device: str, optional
        Whether to use a 'cpu' or 'gpu'.
    Returns
    -------
    X_ism: numpy.ndarray
        The saliency score for each perturbation.
    """

    n_seqs, n_choices, seq_len = X_0.shape
    X_idxs = X_0.argmax(axis=1)

    X = perturbations(X_0)
    X_0 = torch.from_numpy(X_0)

    if device[:4] != str(next(model.parameters()).device):
        model = model.to(device)

    if device[:4] != X_0.device:
        X_0 = X_0.to(device)

    model = model.eval()
    reference = model(X_0).unsqueeze(1)

    starts = numpy.arange(0, X.shape[1], batch_size)
    isms = []
    for i in range(n_seqs):
        X = perturbations(X_0[i])
        y = []

        for start in starts:
            X_ = X[0, start:start+batch_size]
            if device[:4] == 'cuda': 
                X_ = X_.to(device)
            
            y_ = model(X_)
            y.append(y_)
            del X_

        y = torch.cat(y)

        ism = torch.square(y - reference[i]).sum(axis=-1)
        if len(ism.shape) == 2:
            ism = ism.sum(axis=-1)
        ism = torch.sqrt(ism)
        isms.append(ism)

        if device[:4] == 'cuda':
            torch.cuda.synchronize()
            torch.cuda.empty_cache()

    isms = torch.stack(isms)
    isms = isms.reshape(n_seqs, seq_len, n_choices-1)

    j_idxs = torch.arange(n_seqs*seq_len)
    X_ism = torch.zeros(n_seqs*seq_len, n_choices, device=device)
    for i in range(1, n_choices):
        i_idxs = (X_idxs.flatten() + i) % n_choices
        X_ism[j_idxs, i_idxs] = isms[:, :, i-1].flatten()

    X_ism = X_ism.reshape(n_seqs, seq_len, n_choices).permute(0, 2, 1)

    if device[:4] == 'cuda':
        X_ism = X_ism.cpu()

    X_ism = X_ism.numpy()
    return X_ism

In [33]:
from yuzu.naive_ism import naive_ism
naive_ism(model, X_0=sdata.ohe_seqs.transpose(0, 2, 1), batch_size=32, device="cpu")

RuntimeError: expected scalar type Double but found Float

In [27]:
naive_ism?

[0;31mSignature:[0m [0mnaive_ism[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0mX_0[0m[0;34m,[0m [0mbatch_size[0m[0;34m=[0m[0;36m128[0m[0;34m,[0m [0mdevice[0m[0;34m=[0m[0;34m'cpu'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
In-silico mutagenesis saliency scores. 
This function will perform in-silico mutagenesis in a naive manner, i.e.,
where each input sequence has a single mutation in it and the entirety
of the sequence is run through the given model. It returns the ISM score,
which is a vector of the L2 difference between the reference sequence 
and the perturbed sequences with one value for each output of the model.
Parameters
----------
model: torch.nn.Module
    The model to use.
X_0: numpy.ndarray
    The one-hot encoded sequence to calculate saliency for.
batch_size: int, optional
    The size of the batches.
device: str, optional
    Whether to use a 'cpu' or 'gpu'.
Returns
-------
X_ism: numpy.ndarray
    The saliency score for each perturb

In [22]:
print(type(sdata.ohe_seqs.transpose(0, 2, 1)))

<class 'numpy.ndarray'>


In [15]:
in_silico_mutagenesis(model, sdata.ohe_seqs.transpose(0, 2, 1), batch_size=128)

ValueError: X_0 must have three dimensions: (n_seqs, n_choices, seq_len).

In [25]:
import torch

In [36]:
model(torch.Tensor(all_possible_muts.transpose(1,2).float()))

: 

: 