# New Kopp21CNN model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import eugene.models.base._layers as layers
import eugene.models.base._blocks as blocks
import eugene.models.base._towers as towers

class Kopp21CNN(nn.Module):
    """
    Custom convolutional model used in Kopp et al. 2021 paper

    PyTorch implementation of the TensorFlow model described here:
    https://github.com/wkopp/janggu_usecases/tree/master/01_jund_prediction

    The model is double-stranded ("ds"): ``forward`` derives the reverse
    complement of its input itself (through ``layers.RevComp``), runs both
    strands through the same first convolution and merges the two activation
    maps according to ``aggr``. Callers only pass the forward strand, with
    4 channels on dim 1 (the first convolution is ``Conv1d(4, ...)``).

    Parameters
    ----------
    input_len : int
        Length of the input sequence. Stored on the instance; no layer is
        sized from it.
    output_dim : int
        Dimension of the output.
    aggr : str or None, optional
        How the forward and reverse-complement activations are merged after
        the first convolution. One of "max" (elementwise maximum, the
        default), "avg" (elementwise mean), "concat" (concatenation along the
        length axis, dim 2) or None (concatenation along the channel axis,
        dim 1, which doubles the channels seen by the following layers).
    filters : list, optional
        Number of filters in the two convolutional layers.
    conv_kernel_size : list, optional
        Kernel size of the two convolutional layers.
    maxpool_kernel_size : int, optional
        Kernel size of the maxpooling layer.
    stride : int, optional
        Stride of the convolutional layers; also used as the stride of the
        maxpooling layer.
    dropout_rate : float, optional
        Dropout probability applied to the input before the first convolution.

    Raises
    ------
    ValueError
        If ``aggr`` is not one of the supported aggregation modes.
    """

    # Aggregation modes understood by ``forward``.
    _AGGR_MODES = ("concat", "max", "avg", None)

    def __init__(
        self,
        input_len: int,
        output_dim: int,
        aggr: str = "max",
        filters: list = [10, 8],
        conv_kernel_size: list = [11, 3],
        maxpool_kernel_size: int = 30,
        stride: int = 1,
        dropout_rate: float = 0.0,
    ):
        super(Kopp21CNN, self).__init__()

        # Fail early: an unknown mode would otherwise make ``forward`` skip the
        # strand merge and feed the raw (un-convolved) input to the pooling layer.
        if aggr not in self._AGGR_MODES:
            raise ValueError(
                f"Unsupported aggr {aggr!r}; expected one of {self._AGGR_MODES}"
            )

        # Set the attributes
        self.input_len = input_len
        self.output_dim = output_dim
        self.aggr = aggr

        # Channel-wise concatenation (aggr=None) stacks both strands on dim 1,
        # so the layers after the merge see twice as many channels.
        merged_channels = filters[0] * 2 if aggr is None else filters[0]

        # Layers are created in the same order as before so that seeded
        # initialisation of the parameters is unaffected.
        self.revcomp = layers.RevComp()
        self.dropout = nn.Dropout(dropout_rate)
        self.conv = nn.Conv1d(4, filters[0], conv_kernel_size[0], stride=stride)
        self.relu = nn.ReLU(inplace=False)
        self.maxpool = nn.MaxPool1d(kernel_size=maxpool_kernel_size, stride=stride)
        self.batchnorm = nn.BatchNorm1d(merged_channels)
        self.conv2 = nn.Conv1d(merged_channels, filters[1], conv_kernel_size[1], stride=stride)
        self.relu2 = nn.ReLU(inplace=False)
        self.batchnorm2 = nn.BatchNorm1d(filters[1])
        self.linear = nn.Linear(filters[1], self.output_dim)

    def forward(self, x):
        # Dropout is applied before the reverse complement is taken, so both
        # strands are derived from the same dropped-out input.
        x = self.dropout(x)
        x_rev_comp = self.revcomp(x)

        # Both strands share the weights of the first convolution.
        x_fwd = self.relu(self.conv(x))
        x_rev_comp = self.relu(self.conv(x_rev_comp))

        if self.aggr == "concat":
            x = torch.cat((x_fwd, x_rev_comp), dim=2)
        elif self.aggr == "max":
            x = torch.max(x_fwd, x_rev_comp)
        elif self.aggr == "avg":
            x = (x_fwd + x_rev_comp) / 2
        elif self.aggr is None:
            x = torch.cat((x_fwd, x_rev_comp), dim=1)
        else:
            # Only reachable if ``self.aggr`` was changed after construction.
            raise ValueError(
                f"Unsupported aggr {self.aggr!r}; expected one of {self._AGGR_MODES}"
            )

        x = self.maxpool(x)
        x = self.batchnorm(x)
        x = self.conv2(x)
        x = self.relu2(x)
        # Global max pooling: collapse whatever length remains to 1.
        x = F.max_pool1d(x, x.shape[2])
        x = self.batchnorm2(x)
        x = x.view(x.shape[0], -1)
        x = self.linear(x)
        return x

  pkg_resources.require(self.requirement)
  pkg_resources.require(self.requirement)


In [2]:
import torch
import sys
sys.path.append("/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/scripts/kopp21")
from kopp21_helpers import Kopp21CNN

In [5]:
import os
import yaml
import importlib
from eugene import settings, models
settings.config_dir = "/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/configs/kopp21"

def load_config_nn(
    config_path,
    **kwargs
):
    """Build a ``eugene.models`` module around a ``kopp21_helpers`` architecture.

    The YAML config must provide:

    * ``module`` -- name of the wrapper class looked up in ``eugene.models``;
    * ``model`` -- mapping with ``arch_name`` (class name looked up in
      ``kopp21_helpers``) and ``arch`` (keyword arguments for that class).

    Every other top-level key is forwarded to the wrapper class as a keyword
    argument, together with ``**kwargs``.

    Parameters
    ----------
    config_path : str or os.PathLike
        Path to the YAML config. A bare filename (no directory component) is
        resolved against ``settings.config_dir``.
    **kwargs
        Extra keyword arguments for the wrapper class (e.g. ``seed``).

    Returns
    -------
    The instantiated wrapper module.

    Raises
    ------
    KeyError
        If ``module``, ``model``, ``arch_name`` or ``arch`` is missing.
    """
    # Accept pathlib.Path as well as str.
    config_path = os.fspath(config_path)
    # If config path is just a filename, assume it's in the default config directory
    if not os.path.dirname(config_path):
        config_path = os.path.join(settings.config_dir, config_path)
    with open(config_path, "r") as f:
        # NOTE(review): FullLoader constructs more than plain data types; if
        # configs never rely on that, yaml.safe_load would be the safer choice.
        config = yaml.load(f, Loader=yaml.FullLoader)
    module_name = config.pop("module")
    model_params = config.pop("model")
    arch_name = model_params["arch_name"]
    arch_kwargs = model_params["arch"]
    arch_cls = getattr(importlib.import_module("kopp21_helpers"), arch_name)
    model = arch_cls(**arch_kwargs)
    module_cls = getattr(importlib.import_module("eugene.models"), module_name)
    # Whatever is left in `config` after the pops are wrapper-level settings.
    return module_cls(model, **config, **kwargs)

In [6]:
model = load_config_nn("kopp21_cnn_nn.yaml", seed=13)

[rank: 0] Global seed set to 13


In [13]:
x = torch.randn(10, 4, 1000)

In [14]:
model(x).shape

torch.Size([10, 1])

In [8]:
import torchinfo

In [9]:
torchinfo.summary(model, input_size=(10, 4, 170))

Layer (type:depth-idx)                   Output Shape              Param #
SequenceModule                           [10, 1]                   --
├─Kopp21CNN: 1-1                         [10, 1]                   --
│    └─Dropout: 2-1                      [10, 4, 170]              --
│    └─RevComp: 2-2                      [10, 4, 170]              --
│    └─Conv1d: 2-3                       [10, 10, 160]             450
│    └─ReLU: 2-4                         [10, 10, 160]             --
│    └─Conv1d: 2-5                       [10, 10, 160]             (recursive)
│    └─ReLU: 2-6                         [10, 10, 160]             --
│    └─MaxPool1d: 2-7                    [10, 10, 131]             --
│    └─BatchNorm1d: 2-8                  [10, 10, 131]             20
│    └─Conv1d: 2-9                       [10, 8, 129]              248
│    └─ReLU: 2-10                        [10, 8, 129]              --
│    └─BatchNorm1d: 2-11                 [10, 8, 1]                16
│   

In [10]:
model2 = models.load_config("kopp21_cnn.yaml", seed=13)

[rank: 0] Global seed set to 13


In [11]:
torchinfo.summary(model2, input_size=(10, 4, 170))

Layer (type:depth-idx)                   Output Shape              Param #
SequenceModule                           [10, 1]                   --
├─Kopp21CNN: 1-1                         [10, 1]                   20
│    └─RevComp: 2-1                      [10, 4, 170]              --
│    └─Conv1d: 2-2                       [10, 10, 160]             450
│    └─Conv1d: 2-3                       [10, 10, 160]             (recursive)
│    └─MaxPool1d: 2-4                    [10, 10, 131]             --
│    └─Conv1d: 2-5                       [10, 8, 129]              248
│    └─BatchNorm1d: 2-6                  [10, 8, 1]                16
│    └─Linear: 2-7                       [10, 1]                   9
Total params: 743
Trainable params: 743
Non-trainable params: 0
Total mult-adds (M): 1.76
Input size (MB): 0.03
Forward/backward pass size (MB): 0.34
Params size (MB): 0.00
Estimated Total Size (MB): 0.37

# Old stuff

In [178]:
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload 
%autoreload 2

In [1]:
import os
import logging
import torch
import numpy as np
import pandas as pd
import eugene as eu

Global seed set to 13
2023-06-10 11:16:05.999419: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-10 11:16:06.557977: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
2023-06-10 11:16:06.558032: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-10 11:16:11.255854: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynami

In [2]:
# Configure EUGENe 
eu.settings.dataset_dir = "/cellar/users/aklie/data/eugene/kopp21"

In [3]:
sdata = eu.dl.read_h5sd(filename=os.path.join(eu.settings.dataset_dir, "jund_train_processed.h5sd"))
sdata

SeqData object with = 948771 seqs
seqs = (948771,)
names = (948771,)
rev_seqs = None
ohe_seqs = (948771, 4, 500)
ohe_rev_seqs = (948771, 4, 500)
seqs_annot: 'chr', 'end', 'start', 'target', 'train_test', 'train_val'
pos_annot: None
seqsm: None
uns: None

# RevComp working?

In [69]:
import os
from eugene import settings, models
from eugene.models import zoo
import torch
import seqdata as sd
import seqpro as sp
import numpy as np
settings.dataset_dir = "/cellar/users/aklie/data/eugene/revision/kopp21"
settings.config_dir = "/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/configs/kopp21"

In [4]:
sdata = sd.open_zarr(os.path.join(settings.dataset_dir, 'kopp21_train.zarr'))
sdata

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type object numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type int64 numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type int64 numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,904.82 MiB,1.81 MiB
Shape,"(948771, 1, 500)","(29650, 1, 32)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray
"Array Chunk Bytes 904.82 MiB 1.81 MiB Shape (948771, 1, 500) (29650, 1, 32) Dask graph 512 chunks in 2 graph layers Data type uint16 numpy.ndarray",500  1  948771,

Unnamed: 0,Array,Chunk
Bytes,904.82 MiB,1.81 MiB
Shape,"(948771, 1, 500)","(29650, 1, 32)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.77 GiB,3.56 MiB
Shape,"(948771, 500, 4)","(59299, 63, 1)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 1.77 GiB 3.56 MiB Shape (948771, 500, 4) (59299, 63, 1) Dask graph 512 chunks in 2 graph layers Data type uint8 numpy.ndarray",4  500  948771,

Unnamed: 0,Array,Chunk
Bytes,1.77 GiB,3.56 MiB
Shape,"(948771, 500, 4)","(59299, 63, 1)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,452.41 MiB,1.81 MiB
Shape,"(948771, 500)","(59299, 32)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 452.41 MiB 1.81 MiB Shape (948771, 500) (59299, 32) Dask graph 256 chunks in 2 graph layers Data type |S1 numpy.ndarray",500  948771,

Unnamed: 0,Array,Chunk
Bytes,452.41 MiB,1.81 MiB
Shape,"(948771, 500)","(59299, 32)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type object numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 0.90 MiB 231.63 kiB Shape (948771,) (237193,) Dask graph 4 chunks in 2 graph layers Data type uint8 numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 0.90 MiB 231.63 kiB Shape (948771,) (237193,) Dask graph 4 chunks in 2 graph layers Data type bool numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 0.90 MiB 231.63 kiB Shape (948771,) (237193,) Dask graph 4 chunks in 2 graph layers Data type bool numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray


In [128]:
def prep_new_model(
    config,
    seed,
):
    """Load a model from a config and run ``models.init_weights`` on it.

    Parameters
    ----------
    config :
        Passed to ``models.load_config`` as ``config_path``.
    seed :
        Passed to ``models.load_config`` as ``seed``.

    Returns
    -------
    The loaded model, after ``models.init_weights`` has been applied to it.
    """
    fresh_model = models.load_config(config_path=config, seed=seed)

    # Generic weight initialisation; meant to run before any conv filter
    # initialisation is applied to the model.
    models.init_weights(fresh_model)

    return fresh_model

In [129]:
# Build the model described by the kopp21_cnn config with a fixed seed.
config = "kopp21_cnn.yaml"
model = prep_new_model(config=config, seed=0)
# Model name = config filename up to its first "." (here "kopp21_cnn").
model_name = config.split('.')[0]
print(model_name)

[rank: 0] Global seed set to 0


kopp21_cnn


In [134]:
from seqexplainer import get_layer_outputs
import torch.nn as nn

In [153]:
# Per-variable conversions to float32 tensors. "ohe_seq" additionally swaps
# axes 1 and 2 (sdata's one-hot array is stored as (n, 500, 4), so this puts
# the 4 channels on dim 1).
transforms = {
    "target": lambda x: torch.tensor(x, dtype=torch.float32),
    "ohe_seq": lambda x: torch.tensor(x, dtype=torch.float32).swapaxes(1, 2),
}

In [154]:
ohe_seqs = sdata["ohe_seq"][:10].values
ohe_seqs_tensor = transforms["ohe_seq"](ohe_seqs)
sp.decode_ohe(ohe_seqs_tensor.numpy().astype(np.uint8), ohe_axis=1, alphabet=sp.alphabets.DNA)

array([[b'T', b'T', b'T', ..., b'G', b'A', b'G'],
       [b'A', b'T', b'T', ..., b'C', b'A', b'T'],
       [b'A', b'A', b'T', ..., b'A', b'T', b'G'],
       ...,
       [b'T', b'T', b'T', ..., b'A', b'G', b'T'],
       [b'C', b'C', b'G', ..., b'T', b'G', b'G'],
       [b'T', b'A', b'A', ..., b'T', b'C', b'C']], dtype='|S1')

In [155]:
class RevComp(nn.Module):
    """Reverse-complement layer implemented as a flip over the given axes.

    With the default ``dim=[1, 2]`` both the channel axis and the length axis
    of a ``(batch, channels, length)`` tensor are reversed.
    NOTE(review): this equals the reverse complement only when complementary
    bases sit at mirrored channel indices (e.g. A, C, G, T ordering) -- the
    notebook checks this against ``sp.alphabets.DNA``.

    Parameters
    ----------
    dim : int, list or tuple, optional
        Axis or axes handed to ``Tensor.flip``.
    """
    def __init__(self, dim=[1,2]):
        super().__init__()
        # Copy list arguments so an instance never aliases the shared default
        # list (or a list the caller keeps mutating).
        self.dim = list(dim) if isinstance(dim, list) else dim

    def forward(self, x):
        return x.flip(self.dim)

In [156]:
rc_layer = RevComp()
rc_ohe_seqs_tensor = rc_layer(ohe_seqs_tensor)
sp.decode_ohe(rc_ohe_seqs_tensor.numpy().astype(np.uint8), ohe_axis=1, alphabet=sp.alphabets.DNA)

array([[b'C', b'T', b'C', ..., b'A', b'A', b'A'],
       [b'A', b'T', b'G', ..., b'A', b'A', b'T'],
       [b'C', b'A', b'T', ..., b'A', b'T', b'T'],
       ...,
       [b'A', b'C', b'T', ..., b'A', b'A', b'A'],
       [b'C', b'C', b'A', ..., b'C', b'G', b'G'],
       [b'G', b'G', b'A', ..., b'T', b'T', b'A']], dtype='|S1')

In [157]:
import eugene.models.base._layers as layers

In [158]:
rc_layer_eugene = layers.RevComp()
rc_ohe_seqs_tensor = rc_layer_eugene(ohe_seqs_tensor)
sp.decode_ohe(rc_ohe_seqs_tensor.numpy().astype(np.uint8), ohe_axis=1, alphabet=sp.alphabets.DNA)

array([[b'C', b'T', b'C', ..., b'A', b'A', b'A'],
       [b'A', b'T', b'G', ..., b'A', b'A', b'T'],
       [b'C', b'A', b'T', ..., b'A', b'T', b'T'],
       ...,
       [b'A', b'C', b'T', ..., b'A', b'A', b'A'],
       [b'C', b'C', b'A', ..., b'C', b'G', b'G'],
       [b'G', b'G', b'A', ..., b'T', b'T', b'A']], dtype='|S1')

In [159]:
outs = get_layer_outputs(
    model,
    inputs=transforms["ohe_seq"](ohe_seqs),
    layer_name="arch.revcomp"
)

Computing layer outputs for layer arch.revcomp on batches of size 32:   0%|          | 0/1 [00:00<?, ?it/s]

In [160]:
sp.decode_ohe(outs.astype(np.uint8), ohe_axis=1, alphabet=sp.alphabets.DNA)

array([[b'C', b'T', b'C', ..., b'A', b'A', b'A'],
       [b'A', b'T', b'G', ..., b'A', b'A', b'T'],
       [b'C', b'A', b'T', ..., b'A', b'T', b'T'],
       ...,
       [b'A', b'C', b'T', ..., b'A', b'A', b'A'],
       [b'C', b'C', b'A', ..., b'C', b'G', b'G'],
       [b'G', b'G', b'A', ..., b'T', b'T', b'A']], dtype='|S1')

In [163]:
from eugene.dataload._augment import RandomRC
random_rc = RandomRC(rc_prob=1)
def ohe_seq_transform(x):
    """Convert ``x`` to a float32 tensor, swap axes 1 and 2, then apply ``random_rc``."""
    channels_first = torch.tensor(x, dtype=torch.float32).swapaxes(1, 2)
    return random_rc(channels_first)
transforms["ohe_seq"] = ohe_seq_transform
transforms

{'target': <function __main__.<lambda>(x)>,
 'ohe_seq': <function __main__.ohe_seq_transform(x)>}

In [164]:
transformed_ohe_seqs = transforms["ohe_seq"](ohe_seqs).numpy().astype(np.uint8)

In [165]:
sp.decode_ohe(transformed_ohe_seqs, ohe_axis=1, alphabet=sp.alphabets.DNA)

array([[b'C', b'T', b'C', ..., b'A', b'A', b'A'],
       [b'A', b'T', b'G', ..., b'A', b'A', b'T'],
       [b'C', b'A', b'T', ..., b'A', b'T', b'T'],
       ...,
       [b'A', b'C', b'T', ..., b'A', b'A', b'A'],
       [b'C', b'C', b'A', ..., b'C', b'G', b'G'],
       [b'G', b'G', b'A', ..., b'T', b'T', b'A']], dtype='|S1')

In [26]:
next(iter(dl))

{'ohe_seq': tensor([[[0., 0., 0.,  ..., 0., 0., 1.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 1., 1., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.]],
 
         [[0., 1., 0.,  ..., 0., 1., 1.],
          [1., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.]],
 
         [[1., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 1., 1.],
          [0., 1., 1.,  ..., 1., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 1., 0., 0.],
          [0., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [1., 0., 0.,  ..., 0., 1., 1.]],
 
         [[0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 1., 1.],
          [1., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 1., 1.],
          [0., 1., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0

# dsFCN, dsCNN, dsRNN, dsHybrid

In [1]:
import os
import yaml
from eugene import settings, models
settings.config_dir = "/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/configs/kopp21"

  pkg_resources.require(self.requirement)
  pkg_resources.require(self.requirement)


In [2]:
import sys
sys.path.append("/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/scripts/kopp21")
from kopp21_helpers import dsFCN, dsCNN, dsRNN, dsHybrid

In [6]:
from pytorch_lightning import seed_everything
# Function to instantiate a new model
def prep_new_model(
    config,
    seed,
    arch_cls=None,
):
    """Instantiate a seeded ``SequenceModule`` around an architecture built from a YAML file.

    Parameters
    ----------
    config :
        Path to a YAML file whose ``arch`` entry holds the keyword arguments
        for the architecture class.
    seed :
        Passed to ``seed_everything`` before the architecture is created.
    arch_cls : callable, optional
        Architecture class to instantiate with ``**config["arch"]``. Defaults
        to ``dsRNN``; pass another class (e.g. ``dsFCN``, ``dsCNN`` or
        ``dsHybrid``) instead of editing this function.

    Returns
    -------
    A ``models.SequenceModule`` configured for binary classification.
    """
    # Load in the arch
    with open(config, 'r') as f:
        arch_config = yaml.safe_load(f)

    # Set seed (before any parameters are created)
    seed_everything(seed)

    # Initialize the model
    if arch_cls is None:
        arch_cls = dsRNN
    arch = arch_cls(**arch_config["arch"])
    models.init_weights(arch)
    model = models.SequenceModule(
        arch=arch,
        task="binary_classification",
        loss_fxn="bce",
        optimizer="adam",
        optimizer_lr=0.001,
        scheduler="reduce_lr_on_plateau",
        scheduler_monitor="val_loss_epoch",
        scheduler_kwargs={"patience": 2}
    )

    # Return the model
    return model

In [7]:
config = "dsrnn.yaml"

In [8]:
model = prep_new_model(os.path.join(settings.config_dir, config), seed=0)

[rank: 0] Global seed set to 0


In [9]:
model

SequenceModule(
  (arch): dsRNN(
    (revcomp): RevComp()
    (recurrent_block): RecurrentBlock(
      (layers): LSTM(4, 128, batch_first=True)
    )
    (dense_block): DenseBlock(
      (layers): Sequential(
        (0): Linear(in_features=256, out_features=64, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.2, inplace=False)
        (3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): Linear(in_features=64, out_features=1, bias=True)
      )
    )
  )
  (train_metric): BinaryAUROC()
  (val_metric): BinaryAUROC()
  (test_metric): BinaryAUROC()
)

In [9]:
model.summary()

Model: dsHybrid
Sequence length: 500
Output dimension: 1
Task: binary_classification
Loss function: binary_cross_entropy_with_logits
Optimizer: Adam
	Optimizer parameters: {}
	Optimizer starting learning rate: 0.001
Scheduler: ReduceLROnPlateau
	Scheduler parameters: {'patience': 2}
Metric: auroc
	Metric parameters: {'task': 'binary'}
Seed: None
Parameters summary:


  | Name         | Type        | Params
---------------------------------------------
0 | arch         | dsHybrid    | 734   
1 | train_metric | BinaryAUROC | 0     
2 | val_metric   | BinaryAUROC | 0     
3 | test_metric  | BinaryAUROC | 0     
---------------------------------------------
734       Trainable params
0         Non-trainable params
734       Total params
0.003     Total estimated model params size (MB)

In [200]:
# Build a dsCNN directly from keyword arguments: two conv layers (10 and 8
# channels, kernels 11 and 3), max pooling only after the first one, and a
# single hidden dense layer of 64 units; strands are merged with "concat".
model = dsCNN(
    input_len=500,
    output_dim=1,
    conv_kwargs={
        "input_channels": 4,
        "conv_channels": [10, 8],
        "conv_kernels": [11, 3],
        "conv_strides": [1, 1],
        "pool_types": ["max", None],
        "pool_kernels": [30, None],
        "pool_strides": [1, None],
        "dropout_rates": 0.2,
        "batchnorm": True,
        "activations": "relu"
    },
    dense_kwargs={
        "hidden_dims": [64],
        "dropout_rates": 0.2,
        "batchnorm": True
    },
    aggr="concat"
)

In [201]:
model(transforms["ohe_seq"](ohe_seqs))

tensor([[-0.3172],
        [-0.0057],
        [-0.4521],
        [ 0.5611],
        [ 0.2342],
        [ 0.5530],
        [-0.1829],
        [-0.1716],
        [ 0.2026],
        [ 0.2020]], grad_fn=<DivBackward0>)

In [202]:
model.summary()

AttributeError: 'dsCNN' object has no attribute 'summary'

In [203]:
import torchinfo

In [204]:
torchinfo.summary(model, input_size=(1, 4, 500))

Layer (type:depth-idx)                   Output Shape              Param #
dsCNN                                    [1, 1]                    --
├─RevComp: 1-1                           [1, 4, 500]               --
├─Conv1DTower: 1-2                       [1, 8, 459]               --
│    └─Sequential: 2-1                   [1, 8, 459]               --
│    │    └─Conv1d: 3-1                  [1, 10, 490]              450
│    │    └─ReLU: 3-2                    [1, 10, 490]              --
│    │    └─MaxPool1d: 3-3               [1, 10, 461]              --
│    │    └─Dropout: 3-4                 [1, 10, 461]              --
│    │    └─BatchNorm1d: 3-5             [1, 10, 461]              20
│    │    └─Conv1d: 3-6                  [1, 8, 459]               248
│    │    └─ReLU: 3-7                    [1, 8, 459]               --
│    │    └─Dropout: 3-8                 [1, 8, 459]               --
│    │    └─BatchNorm1d: 3-9             [1, 8, 459]               16
├─Conv1DTower

In [207]:
from eugene import models

In [208]:
def prep_new_model(
    config,
    seed,
):
    """Create a model via ``models.load_config`` and initialise its weights.

    ``config`` is forwarded as ``config_path`` and ``seed`` as ``seed``; the
    loaded model is handed to ``models.init_weights`` and then returned.

    Note: this notebook defines ``prep_new_model`` more than once; whichever
    definition was executed last is the one in effect.
    """
    # Instantiate from the config, then apply the generic weight
    # initialisation (intended to precede any conv filter initialisation).
    built = models.load_config(seed=seed, config_path=config)
    models.init_weights(built)
    return built

In [209]:
model = prep_new_model("hybrid.yaml", seed=0)

[rank: 0] Global seed set to 0


In [210]:
model

SequenceModule(
  (arch): Hybrid(
    (conv1d_tower): Conv1DTower(
      (layers): Sequential(
        (0): Conv1d(4, 10, kernel_size=(11,), stride=(1,), padding=valid)
        (1): ReLU()
        (2): MaxPool1d(kernel_size=30, stride=1, padding=0, dilation=1, ceil_mode=False)
        (3): Dropout(p=0.2, inplace=False)
        (4): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (5): Conv1d(10, 8, kernel_size=(3,), stride=(1,), padding=valid)
        (6): ReLU()
        (7): Dropout(p=0.2, inplace=False)
        (8): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (recurrent_block): RecurrentBlock(
      (layers): LSTM(8, 128, batch_first=True)
    )
    (dense_block): DenseBlock(
      (layers): Sequential(
        (0): Linear(in_features=128, out_features=64, bias=True)
        (1): ReLU()
        (2): Dropout(p=0.2, inplace=False)
        (3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True

In [211]:
import yaml

In [212]:
with open("/cellar/users/aklie/projects/ML4GLand/EUGENe_paper/configs/kopp21/dscnn.yaml", "r") as f:
    config = yaml.safe_load(f)

In [213]:
config

{'arch': {'input_len': 500,
  'output_dim': 1,
  'aggr': 'concat',
  'conv_kwargs': {'input_channels': 4,
   'conv_channels': [10, 8],
   'conv_kernels': [11, 3],
   'conv_strides': [1, 1],
   'pool_types': ['max', None],
   'pool_kernels': [30, None],
   'pool_strides': [1, None],
   'dropout_rates': 0.2,
   'batchnorm': True,
   'activations': 'relu'},
  'dense_kwargs': {'hidden_dims': [64],
   'dropout_rates': 0.2,
   'batchnorm': True}}}

In [None]:
task: binary_classification
loss_fxn: bce
optimizer: adam
optimizer_kwargs:
  amsgrad: true
optimizer_lr: 0.001
scheduler: reduce_lr_on_plateau
scheduler_monitor: val_loss_epoch
scheduler_kwargs:
  patience: 2

In [219]:
# Wrap an architecture in a SequenceModule for binary classification.
# NOTE(review): `arch` is not assigned in any cell visible in this notebook --
# presumably a dsCNN built from the dscnn.yaml config loaded above (the
# summary below reports "Model: dsCNN"). Define it explicitly so the cell
# survives Restart & Run All.
model = models.SequenceModule(
    arch=arch,
    task="binary_classification",
    loss_fxn="bce",
    optimizer="adam",
    optimizer_lr=0.001,
    scheduler="reduce_lr_on_plateau",
    scheduler_monitor="val_loss_epoch",
    scheduler_kwargs={"patience": 2}
)

In [221]:
model.summary()

Model: dsCNN
Sequence length: 500
Output dimension: 1
Task: binary_classification
Loss function: binary_cross_entropy_with_logits
Optimizer: Adam
	Optimizer parameters: {}
	Optimizer starting learning rate: 0.001
Scheduler: ReduceLROnPlateau
	Scheduler parameters: {'patience': 2}
Metric: auroc
	Metric parameters: {'task': 'binary'}
Seed: None
Parameters summary:


  | Name         | Type        | Params
---------------------------------------------
0 | arch         | dsCNN       | 471 K 
1 | train_metric | BinaryAUROC | 0     
2 | val_metric   | BinaryAUROC | 0     
3 | test_metric  | BinaryAUROC | 0     
---------------------------------------------
471 K     Trainable params
0         Non-trainable params
471 K     Total params
1.884     Total estimated model params size (MB)