# Testing `DeepSEA` model class

**Authorship:**
Adam Klie, *07/31/2022*
***
**Description:**
Notebook for testing out the sota `DeepSEA` model class.

In [1]:
import numpy as np
import pandas as pd
import torch

import pytorch_lightning as pl

import eugene as eu
from torch.utils.data import Dataset, DataLoader

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

Global seed set to 13
Global seed set to 13
Global seed set to 13


In [2]:
from eugene.models import DeepSEA

In [3]:
# Length of strand
x_len = 1000

# Substitute x and x_rev for their respective data
x = torch.randn(10, 4, x_len)
x_rev = torch.randn(10, 4, x_len)

# Simple usage of DeepSEA module

The DeepBind module only requires one parameter to function. It can be further customized with optional parameters.

In [4]:
simple_deepsea_instance = DeepSEA(input_len=x_len)

simple_deepsea_instance

DeepSEA(
  (module): Sequential(
    (0): Conv1d(4, 320, kernel_size=(8,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (3): Dropout(p=0.2, inplace=False)
    (4): Conv1d(320, 480, kernel_size=(8,), stride=(1,))
    (5): ReLU()
    (6): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.2, inplace=False)
    (8): Conv1d(480, 960, kernel_size=(8,), stride=(1,))
    (9): ReLU()
    (10): Dropout(p=0.2, inplace=False)
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=50880, out_features=1, bias=True)
    )
  )
  (r_squared): R2Score()
)

In [5]:
# x_rev does not need to be specified in single strand mode
out = simple_deepsea_instance(x)

out, out.shape

(tensor([[ 0.0167],
         [ 0.0913],
         [ 0.0399],
         [-0.0184],
         [ 0.1422],
         [-0.0843],
         [-0.0960],
         [ 0.0060],
         [ 0.0312],
         [ 0.3167]], grad_fn=<AddmmBackward0>),
 torch.Size([10, 1]))

# Training with random dataset

In [9]:
sdata = eu.datasets.random1000()

In [10]:
eu.pp.reverse_complement_data(sdata)

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added


In [11]:
eu.pp.one_hot_encode_data(sdata)

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added


In [12]:
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'TARGETS'
pos_annot: PyRanges object with 1456 features

In [13]:
sdataset = sdata.to_dataset(label="TARGETS", seq_transforms=[], transform_kwargs={"transpose": True})

In [14]:
sdataset[0]

(tensor([115., 101., 113.,  48.,  48.,  49.,  36.]),
 tensor([[0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
          0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
          0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
          1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0.],
         [0., 1., 0., 0., 0., 1.

In [15]:
batch_size = 100

sdataloader = DataLoader(sdataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)

In [16]:
simple_deepbind_instance = DeepBind(input_len=x_len)

trainer = pl.Trainer(accelerator = "cpu", devices = 1)
trainer.fit(simple_deepbind_instance, sdataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name      | Type                      | Params
--------------------------------------------------------
0 | max_pool  | MaxPool1d                 | 0     
1 | avg_pool  | AvgPool1d                 | 0     
2 | convnet   | BasicConv1D               | 272   
3 | fcn       | BasicFullyConnectedModule | 146 K 
4 | r_squared | R2Score                   | 0     
--------------------------------------------------------
147 K     Trainable params
0         Non-trainable params
147 K     Total params
0.588     Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

In [None]:
simple_deepbind_instance

DeepBind(
  (max_pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avg_pool): AvgPool1d(kernel_size=(4,), stride=(4,), padding=(0,))
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(4,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): Dropout(p=0.2, inplace=False)
    )
  )
  (fcn): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=504, out_features=256, bias=True)
      (1): ReLU(inplace=True)
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=256, out_features=64, bias=True)
      (4): ReLU(inplace=True)
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=64, out_features=16, bias=True)
      (7): ReLU(inplace=True)
      (8): Dropout(p=0.2, inplace=False)
      (9): Linear(in_features=16, out_features=4, bias=True)
      (10): ReLU(inplace=True)
      (11): Dropout(p=0.2, inplace=False)
      (12): Linear(in_features=4, out_feature

---

# Scratch