# Implementing implanting a feature into sequences

**Authorship:**
Adam Klie, *09/01/2022*
***
**Description:**
Notebook to implement adding a feature to sequences.
***

In [1]:
# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

# Basic import
import torch
import numpy as np
import eugene as eu
eu.__version__

Global seed set to 13
2022-09-01 23:55:44.798344: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-01 23:55:44.798392: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)


'0.1.0'

In [2]:
sdata = eu.datasets.random1000()

In [3]:
eu.pp.one_hot_encode_data(sdata)

One-hot-encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added


In [4]:
from tqdm.auto import tqdm
from eugene.preprocessing._utils import _token2one_hot
from eugene.preprocessing import ohe_alphabet_seqs
from eugene._settings import settings
def feature_implant_seq(seq, feature, position, encoding="str", onehot=False):
    """
    Insert a feature at a given position in a sequence.
    """
    if encoding == "str":
        return seq[:position] + feature + seq[position + len(feature):]
    elif encoding == "onehot":
        if onehot:
            feature = _token2one_hot(feature.argmax(axis=1), vocab_size=4, fill_value=0)
        return np.concatenate(
            (seq[:position], feature, seq[position + len(feature):]), axis=0
        )
    else:
        raise ValueError("Encoding not recognized.")


def feature_implant_across_seq(seq, feature, **kwargs):
    """
    Insert a feature at every position in a sequence.
    """
    implanted_seqs = []
    for pos in range(len(seq) - len(feature)):
        implanted_seqs.append(feature_implant_seq(seq, feature, pos, **kwargs))
    return np.array(implanted_seqs)


def feature_implant(model, sdata, seq_id, feature, feature_name="feature", encoding="str", onehot=False, device="cpu", store=False):
    """
    Score a set of sequences with a feature inserted at every position of each sequence in sdata
    """
    device = "cuda" if settings.gpus > 0 else "cpu" if device is None else device
    model.to(device)
    seq_idx = np.where(sdata.seqs_annot.index == seq_id)[0][0]
    if encoding == "str":
        seq = sdata.seqs[seq_idx]
        implanted_seqs = feature_implant_across_seq(seq, feature, encoding=encoding)
        implanted_seqs = ohe_alphabet_seqs(implanted_seqs)
        X = torch.from_numpy(implanted_seqs).transpose(1, 2).float()
    elif encoding == "onehot":
        seq = sdata.ohe_seqs[seq_idx]
        implanted_seqs = feature_implant_across_seq(
            seq, feature, encoding=encoding, onehot=onehot
        )
        X = torch.from_numpy(implanted_seqs).transpose(1, 2).float()
    else:
        raise ValueError("Encoding not recognized.")
    X = X.to(device)
    preds = model(X).detach().numpy().squeeze()
    if store:
        sdata.seqsm[f"{seq_id}_{feature_name}_slide"] = preds
    return preds


def feature_implant_sdata(model, sdata, seqsm_key=None, **kwargs):
    """
    Score a set of sequences with a feature inserted at every position of each sequence in sdata
    """
    predictions = []
    for i, seq_id in tqdm(enumerate(sdata.seqs_annot.index), desc="Implanting feature", total=len(sdata.seqs_annot)):
        predictions.append(feature_implant(model, sdata, seq_id, **kwargs))
    print(seqsm_key)
    if seqsm_key is not None:
        sdata.seqsm[seqsm_key] = np.array(predictions)

In [5]:
# Prep data
model = eu.models.DeepBind(input_len=66, output_dim=1)
seq = sdata.seqs[0]
ohe_seq = eu.pp.ohe_DNA_seq(seq)
meme = eu.utils.MinimalMEME(path="../../_datasets/jores21/CPEs.meme")
motif = meme.motifs["TATA"]
name = motif.name
pfm = motif.pfm
consensus = eu.pp.decode_DNA_seq(pfm)
pos = 2

In [8]:
# Test function for single seq
implanted_seq = feature_implant_seq(seq, consensus, pos, encoding="str")
implanted_ohe_seq = feature_implant_seq(ohe_seq, pfm, pos, encoding="onehot")
implanted_full_ohe_seq = feature_implant_seq(ohe_seq, pfm, pos, encoding="onehot", onehot=True)
seq, implanted_seq, ohe_seq[0:10], implanted_ohe_seq[0:10], implanted_full_ohe_seq[0:10], ohe_seq[0:10]

('AGGACAGATTTTCGCGTGTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 'AGCCCCTATAAATACCCCTTGGGCCCAACGGATCAGCCTCTATAAACCGTATCCGACAATATAAGG',
 array([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]], dtype=float16),
 array([[1.    , 0.    , 0.    , 0.    ],
        [0.    , 0.    , 1.    , 0.    ],
        [0.1275, 0.3765, 0.1195, 0.3765],
        [0.1575, 0.3985, 0.199 , 0.2455],
        [0.249 , 0.303 , 0.197 , 0.251 ],
        [0.1235, 0.655 , 0.0755, 0.1455],
        [0.01  , 0.002 , 0.002 , 0.986 ],
        [0.968 , 0.    , 0.    , 0.032 ],
        [0.002 , 0.014 , 0.006 , 0.978 ],
        [0.992 , 0.    , 0.002 , 0.006 ]]),
 array([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
   

In [9]:
# Test function to implant across a seq
implanted_seqs = feature_implant_across_seq(seq, consensus)
implanted_ohe_seqs = feature_implant_across_seq(ohe_seq, pfm, encoding="onehot")
implanted_full_ohe_seqs = feature_implant_across_seq(ohe_seq, pfm, encoding="onehot", onehot=True)
len(implanted_seqs), implanted_ohe_seqs.shape, implanted_full_ohe_seqs.shape

(50, (50, 66, 4), (50, 66, 4))

In [10]:
# Test function for single seq in sdata
predictions = feature_implant(
    model=model, 
    sdata=sdata, 
    feature=pfm, 
    seq_id=sdata.names[0],
    encoding="onehot", 
    onehot=True,
    store=True,
    feature_name=name
)
sdata.seqsm["seq000_TATA_slide"]

array([-0.12860738, -0.20724249, -0.09944402, -0.15735872, -0.07232916,
        0.10080706, -0.05799149, -0.00366257, -0.13281582, -0.03578924,
        0.07352825, -0.08969332, -0.14790955, -0.19986603, -0.17278814,
       -0.18217488, -0.07786699, -0.18028736, -0.13209815, -0.0564633 ,
       -0.15503126, -0.09165487, -0.11240477, -0.0220406 , -0.00690747,
       -0.05037049, -0.13428695, -0.07957186,  0.05299611, -0.00722817,
       -0.05692093, -0.06351749, -0.12362285, -0.04635581, -0.10504552,
        0.02596581,  0.10609262, -0.04033621, -0.09065762, -0.05444472,
       -0.05513809, -0.08035021, -0.02563847, -0.05098003,  0.05189031,
       -0.14998351,  0.04425012,  0.00295492, -0.0998994 , -0.09659025],
      dtype=float32)

In [11]:
# Test function for whole sdata object
feature_implant_sdata(
    model=model, 
    sdata=sdata, 
    feature=pfm, 
    seqsm_key=f"slide_{name}",
    encoding="onehot", 
    onehot=True
)
sdata.seqsm[f"slide_{name}"].shape

Implanting feature:   0%|          | 0/1000 [00:00<?, ?it/s]

slide_TATA


(1000, 50)

---

# Scratch