# Testing `DanQ` model class

**Authorship:**
Adam Klie, *11/05/2022*
***
**Description:**
Notebook for testing out the custom `DanQ` model class.

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import eugene as eu

Global seed set to 13


In [3]:
class DanQ(nn.Module):
    """DanQ model from Quang and Xie, 2016;
        see <https://academic.oup.com/nar/article/44/11/e107/2468300>
        and <https://github.com/uci-cbcl/DanQ/blob/master/DanQ_train.py>
        and <https://github.com/FunctionLab/selene/blob/master/models/danQ.py>

    Pipeline implemented by `forward`: 1D convolution -> ReLU -> max-pool (13)
    -> dropout (0.2) -> bidirectional LSTM -> dropout (0.5) -> flatten ->
    bias-free linear layer (925 units) -> ReLU -> linear layer -> sigmoid.

    Parameters
    ----------
    output_dim : int
        Number of output units (one sigmoid probability per unit).
    d : int, default 320
        Number of first-layer filters created when `conv1_filters` is None,
        and hidden size of each LSTM direction.
    conv1_filters : array-like or None, default None
        Optional initial first-layer filters. The value is passed to
        `torch.Tensor` and used directly as the weight of `torch.conv1d`,
        i.e. it is laid out as (n_filters, in_channels, kernel_width).
    learn_conv1_filters : bool, default True
        If True the first-layer filters are a trainable parameter; if False
        they are registered as a buffer (requires `conv1_filters`).
    """
    def __init__(self, output_dim, d=320,
                 conv1_filters=None, learn_conv1_filters=True):
        super().__init__()

        if d != 320:
            print("NB: number of convolutional filters in original DanQ model is 320; current number of convolutional filters is not set to 320")

        self.activation = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.dropout5 = nn.Dropout(0.5)
        self.flatten = nn.Flatten()

        # Remember what the caller supplied so that
        # get_which_conv_layers_transferred() can report it later.
        self.init_conv1_filters = conv1_filters

        assert (not (conv1_filters is None and not learn_conv1_filters)), "initial conv1_filters cannot be set to None while learn_conv1_filters is set to False"

        # Layer 1 (convolutional), constituent parts
        if conv1_filters is not None:
            if learn_conv1_filters: # continue modifying existing conv1_filters through learning
                self.conv1_filters = torch.nn.Parameter( torch.Tensor(conv1_filters) )
            else:
                # Buffer: saved with the module's state but not a trainable parameter.
                self.register_buffer("conv1_filters", torch.Tensor(conv1_filters))
        else:
            self.conv1_filters = torch.nn.Parameter(torch.zeros(d, 4, 26))
            torch.nn.init.kaiming_normal_(self.conv1_filters)
        self.activation1 = nn.ReLU() # name the first-layer activation function for hook purposes
        self.maxpool1 = nn.MaxPool1d(13)

        # Layer 2 (bi-directional LSTM), constituent parts
        # The LSTM consumes the convolution's output channels, so its input
        # size must follow the filter tensor actually in use. Using `d` here
        # would break forward() whenever supplied filters have a different
        # number of output channels than `d`.
        n_conv1_filters = self.conv1_filters.shape[0]
        self.bdlstm2 = nn.LSTM(n_conv1_filters, d, num_layers=1, batch_first=True, bidirectional=True)

        # Layer 3 (fully connected), constituent parts
        # LazyLinear infers its input width on the first forward pass, which
        # depends on the input sequence length.
        self.fc3 = nn.LazyLinear(925, bias=False)

        # Output layer (fully connected), constituent parts
        self.fc4 = nn.Linear(925, output_dim)
        self.sigmoid = nn.Sigmoid()

    def get_which_conv_layers_transferred(self):
        """Return the list of convolutional layer indices initialised from
        caller-supplied filters: `[1]` if `conv1_filters` was given, else `[]`.
        """
        layers = []
        if self.init_conv1_filters is not None:
            layers.append(1)
        return layers

    def forward(self, x):
        """Run the network on a batch.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape (batch, in_channels, seq_len); `in_channels` must
            match dimension 1 of the first-layer filters (4 for the default
            filters).

        Returns
        -------
        torch.Tensor
            Sigmoid outputs of shape (batch, output_dim).
        """
        # Layer 1
        # Padding is half the kernel width on each side.
        out = torch.conv1d(x, self.conv1_filters, stride=1, padding=(self.conv1_filters.shape[-1]//2))
        out = self.activation1(out)
        out = self.maxpool1(out)
        out = self.dropout2(out)

        # Layer 2
        out = torch.transpose(out, 1, 2) # make dims (batch, seq, features) to comply with bi-dir. LSTM
        out, _ = self.bdlstm2(out) # see <https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html>
        out = self.dropout5(out)
        out = torch.transpose(out, 1, 2) # change dims back to (batch, features, seq)

        # Layer 3
        out = self.flatten(out)
        out = self.fc3(out)
        out = self.activation(out)

        # Output layer
        out = self.fc4(out)
        y_pred = self.sigmoid(out)

        return y_pred

In [None]:
# From Selene: <https://github.com/FunctionLab/selene/blob/master/models/danQ.py>
class DanQ(nn.Module):
    """Sequential-style DanQ: convolution -> bidirectional LSTM -> classifier.

    NOTE(review): an earlier cell of this notebook also defines a class named
    `DanQ` with a different constructor signature; executing this cell
    rebinds the name.
    """
    def __init__(self, sequence_length, n_genomic_features):
        """
        Parameters
        ----------
        sequence_length : int
            Input sequence length
        n_genomic_features : int
            Total number of features to predict
        """
        super(DanQ, self).__init__()
        self.nnet = nn.Sequential(
            nn.Conv1d(4, 320, kernel_size=26),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(
                kernel_size=13, stride=13),
            nn.Dropout(0.2))

        self.bdlstm = nn.Sequential(
            nn.LSTM(
                320, 320, num_layers=1, batch_first=True, bidirectional=True))

        # Number of positions left after the unpadded width-26 convolution
        # (length - 25) and the stride-13 pooling. Integer floor division
        # gives the same value as math.floor((sequence_length - 25) / 13)
        # for integer lengths, without needing the `math` module, which this
        # notebook never imports.
        self._n_channels = (sequence_length - 25) // 13
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self._n_channels * 640, 925),
            nn.ReLU(inplace=True),
            nn.Linear(925, n_genomic_features),
            nn.Sigmoid())

    def forward(self, x):
        """Forward propagation of a batch.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape (batch, 4, sequence_length).

        Returns
        -------
        torch.Tensor
            Sigmoid outputs of shape (batch, n_genomic_features).
        """
        out = self.nnet(x)  # (batch, 320, n_channels)
        # The LSTM is built with batch_first=True, so it needs
        # (batch, seq, features). Feeding (seq, batch, features) would make
        # the recurrence run across the samples of the batch instead of along
        # the sequence.
        reshape_out = out.transpose(1, 2)
        out, _ = self.bdlstm(reshape_out)  # (batch, n_channels, 640)
        reshape_out = out.contiguous().view(
            out.size(0), 640 * self._n_channels)
        predict = self.classifier(reshape_out)
        return predict

def criterion():
    """Return a freshly constructed binary cross-entropy loss (`nn.BCELoss`)."""
    loss_fn = nn.BCELoss()
    return loss_fn

def get_optimizer(lr):
    """Return the optimizer class and its keyword arguments.

    NOTE(review): the body of this function was missing in the notebook (the
    bare `def` line is a syntax error). It has been filled in to match the
    Selene DanQ reference file as recalled — confirm against that file.

    Parameters
    ----------
    lr : float
        Learning rate.

    Returns
    -------
    tuple
        `(torch.optim.RMSprop, {"lr": lr})`; the optimizer is not
        instantiated here.
    """
    return (torch.optim.RMSprop, {"lr": lr})

In [4]:
# NOTE(review): `Basset` is neither defined nor imported anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. The
# recorded output of the next cell (a 10x2 tensor with SigmoidBackward0) is
# consistent with the `DanQ(output_dim)` class defined above — presumably
# this should be `DanQ(2)`; confirm.
model = Basset(2)



In [5]:
# Smoke test: push a random batch of shape (10, 4, 100) through the model
# (presumably batch x channels x sequence length — TODO confirm).
x = torch.randn(10, 4, 100)
# Bare last expression so the notebook displays the model output.
model(x)

tensor([[0.5405, 0.4398],
        [0.6291, 0.5908],
        [0.5014, 0.6883],
        [0.3922, 0.7281],
        [0.5423, 0.5404],
        [0.4932, 0.5351],
        [0.5355, 0.4664],
        [0.4262, 0.4420],
        [0.4865, 0.6399],
        [0.4958, 0.5314]], grad_fn=<SigmoidBackward0>)

In [25]:
# Load eugene's `random1000` dataset (the recorded progress bar below counts
# 1000 sequences).
sdata = eu.datasets.random1000()
# One-hot encode the sequences; the recorded output reports `ohe_seqs` being
# added to the SeqData object.
eu.pp.ohe_seqs_sdata(sdata)
# Split into train/validation; the recorded output reports a `train_val`
# annotation being added to `seqs_annot`.
eu.pp.train_test_split_sdata(sdata)

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
SeqData object modified:
    seqs_annot:
        + train_val


In [26]:
# Fit `model` on `sdata` against the "activity_0" target for one epoch with
# batches of 32.
# NOTE(review): the recorded log below lists ResidualBind-style layers (conv,
# residual_block, average_pool, ...) and an "ssResidualBind_regression" log
# folder, which does not match the DanQ class defined in this notebook — the
# output looks stale; re-run on a fresh kernel to confirm.
eu.train.fit(model, sdata, target_keys="activity_0", epochs=1, batch_size=32)

Global seed set to 13
Missing logger folder: /workspaces/EUGENe/tests/notebooks/implement/models/eugene_logs/ssResidualBind_regression
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name           | Type                      | Params
-------------------------------------------------------------
0 | hp_metric      | R2Score                   | 0     
1 | conv           | BasicConv1D               | 4.5 K 
2 | residual_block | ResidualModule            | 83.8 K
3 | average_pool   | AvgPool1d                 | 0     
4 | dropout        | Dropout                   | 0     
5 | flatten        | Flatten                   | 0     
6 | fc             | BasicFullyConnectedModule | 2.0 M 
-------------------------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.320     Total estimated model params size (MB)


Dropping 0 sequences with NaN targets.
No transforms given, assuming just need to tensorize.
No transforms given, assuming just need to tensorize.


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 13
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]