# Setup of file and variables

In [1]:
import numpy as np
import pandas as pd
import torch

import pytorch_lightning as pl

import eugene as eu
from torch.utils.data import Dataset, DataLoader

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

Global seed set to 13
Global seed set to 13
Global seed set to 13


In [2]:
from eugene.models import DeepSEA

In [3]:
# Length of strand
x_len = 1000

# Substitute x and x_rev for their respective data
x = torch.randn(10, 4, x_len)
x_rev = torch.randn(10, 4, x_len)

# Simple usage of DeepSEA module

The DeepBind module only requires one parameter to function. It can be further customized with optional parameters.

In [4]:
simple_deepsea_instance = DeepSEA(input_len=x_len)

simple_deepsea_instance

DeepSEA(
  (module): Sequential(
    (0): Conv1d(4, 320, kernel_size=(8,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (3): Dropout(p=0.2, inplace=False)
    (4): Conv1d(320, 480, kernel_size=(8,), stride=(1,))
    (5): ReLU()
    (6): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
    (7): Dropout(p=0.2, inplace=False)
    (8): Conv1d(480, 960, kernel_size=(8,), stride=(1,))
    (9): ReLU()
    (10): Dropout(p=0.2, inplace=False)
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=50880, out_features=1, bias=True)
    )
  )
  (r_squared): R2Score()
)

In [5]:
# x_rev does not need to be specified in single strand mode
out = simple_deepsea_instance(x)

out, out.shape

RuntimeError: mat1 and mat2 shapes cannot be multiplied (9600x53 and 50880x1)

# DeepBind Parameters

- input_len : int - length of input strand
- strand : string - type of strand to process, only  || default : "ss" || res: single ("ss"), double ("ds"), twin ("ts")
- task : string - task for the model to perform || default : "regression" || res: "regression", "binary_classification"

#### conv_kwargs

- channels : list of ints - amount of channels for each convolutional layer || default : [4, 16]
- conv_kernels : list of int(s) - size of convolutional kernels || default : [4] || res : length must be 1 less than that of channels
- pool_kernels : list of int(s) - size of max pooling kernels || default : [4] || res: length must be 1 less than that of channels, only applies is length of channels is > 2
- omit_final_pool : boolean - bypass final max pooling step of output || default : True
- dropout_rates : float - probability for the dropout of any given node || default : 0.2
- batchnorm : boolean - enable batch normalization between layers || default : False

#### mp_kwargs

- kernel_size : int - motif pooling kernel size || default : 4 || res : multiples of 2

#### fc_kwargs

- output_dim : int - number of output dimensions for the network || default : 1
- hidden_dims : list of int(s) - width of each hidden dimension || default : [256, 64, 16, 4]
- dropout_rate : float - probability for the dropout of any given node || default : 0.2
- batchnorm : boolean - enable batch normalization between layers || default : False

In [7]:
customized_deepbind_instance = DeepBind(
    input_len=x_len,
    strand="ts",
    mp_kwargs=dict(kernel_size=16),
    conv_kwargs=dict(channels=[4, 16, 32], conv_kernels=[4, 4], pool_kernels=[4, 4], omit_final_pool=False, dropout_rates=0.5, batchnorm=True),
    fc_kwargs=dict(output_dim=5, hidden_dims=[256, 128, 64, 32, 16], dropout_rate=0.3, batchnorm=True)
)

customized_deepbind_instance

DeepBind(
  (max_pool): MaxPool1d(kernel_size=16, stride=16, padding=0, dilation=1, ceil_mode=False)
  (avg_pool): AvgPool1d(kernel_size=(16,), stride=(16,), padding=(0,))
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(4,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (3): Dropout(p=0.5, inplace=False)
      (4): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): Conv1d(16, 32, kernel_size=(4,), stride=(1,))
      (6): ReLU()
      (7): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
      (8): Dropout(p=0.5, inplace=False)
      (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (fcn): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=12, out_features=256, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.3, inplace=Fals

In [8]:
# x_rev must be specified in double and twin strand modes
out = customized_deepbind_instance(x, x_rev)

out, out.shape

(tensor([[-0.3034],
         [ 0.1875],
         [ 0.0155],
         [ 0.1738],
         [-0.1377],
         [ 0.0516],
         [-0.2159],
         [-0.0738],
         [ 0.0833],
         [ 0.0272]], grad_fn=<UnsqueezeBackward0>),
 torch.Size([10, 1]))

# Training with random dataset

In [9]:
sdata = eu.datasets.random1000()

In [10]:
eu.pp.reverse_complement_data(sdata)

SeqData object modified:
	rev_seqs: None -> 1000 rev_seqs added


In [11]:
eu.pp.one_hot_encode_data(sdata)

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added


In [12]:
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = (1000,)
ohe_seqs = (1000, 66, 4)
ohe_rev_seqs = (1000, 66, 4)
seqs_annot: 'TARGETS'
pos_annot: PyRanges object with 1456 features

In [13]:
sdataset = sdata.to_dataset(label="TARGETS", seq_transforms=[], transform_kwargs={"transpose": True})

In [14]:
sdataset[0]

(tensor([115., 101., 113.,  48.,  48.,  49.,  36.]),
 tensor([[0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
          0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
          0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
          1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0.],
         [0., 1., 0., 0., 0., 1.

In [15]:
batch_size = 100

sdataloader = DataLoader(sdataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)

In [16]:
simple_deepbind_instance = DeepBind(input_len=x_len)

trainer = pl.Trainer(accelerator = "cpu", devices = 1)
trainer.fit(simple_deepbind_instance, sdataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name      | Type                      | Params
--------------------------------------------------------
0 | max_pool  | MaxPool1d                 | 0     
1 | avg_pool  | AvgPool1d                 | 0     
2 | convnet   | BasicConv1D               | 272   
3 | fcn       | BasicFullyConnectedModule | 146 K 
4 | r_squared | R2Score                   | 0     
--------------------------------------------------------
147 K     Trainable params
0         Non-trainable params
147 K     Total params
0.588     Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

In [None]:
simple_deepbind_instance

DeepBind(
  (max_pool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (avg_pool): AvgPool1d(kernel_size=(4,), stride=(4,), padding=(0,))
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(4,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): Dropout(p=0.2, inplace=False)
    )
  )
  (fcn): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=504, out_features=256, bias=True)
      (1): ReLU(inplace=True)
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=256, out_features=64, bias=True)
      (4): ReLU(inplace=True)
      (5): Dropout(p=0.2, inplace=False)
      (6): Linear(in_features=64, out_features=16, bias=True)
      (7): ReLU(inplace=True)
      (8): Dropout(p=0.2, inplace=False)
      (9): Linear(in_features=16, out_features=4, bias=True)
      (10): ReLU(inplace=True)
      (11): Dropout(p=0.2, inplace=False)
      (12): Linear(in_features=4, out_feature

---

# Scratch