# Testing EUGENE training

**Authorship:**
Adam Klie, *03/19/2022*
***
**Description:**
Notebook for testing the training of EUGENE architectures

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <li><b>Config for EUGENES</b></li>
    </ul>
</div>

In [1]:
import numpy as np
import pandas as pd
import torch

# Autoreload extension: load it only if this kernel has not loaded it yet,
# then use mode 2 so that all modules are reloaded before each cell runs
# (lets edits to the project sources take effect without a kernel restart).
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import sys
# Add the sibling "eugene" directory to the import path. The path is relative,
# so it depends on the directory the kernel was started from.
# Presumably the bare-name imports used later in this notebook (ssEUGENE,
# dsEUGENE, MPRADataset, ...) resolve through this entry — TODO confirm.
sys.path.append("../eugene")

# Random dataset
<div class="alert alert-info" role="alert">
  <b>Test the basic functionality of the EUGENE architectures on randomly generated data</b>
</div>

## Instantiate EUGENE architecture: ssEUGENE

In [23]:
# Keyword arguments for the three model components, written as dict literals.
# They are handed to the model constructor as conv_kwargs / rnn_kwargs /
# fc_kwargs in a later cell.
cnn = {
    "input_len": 66,
    "channels": [4, 16],
    "conv_kernels": [15, 5],
    "pool_kernels": [1, 1],
}
rnn = {"output_dim": 32, "batch_first": True}
fc = {"output_dim": 1}

In [25]:
from ssEUGENE import ssEUGENE

In [26]:
# Build the ssEUGENE model from the three kwargs dicts defined in the
# previous cell. The bare name on the last line makes the notebook display
# the module's repr (its layer structure).
eugene = ssEUGENE(
    conv_kwargs=cnn,
    rnn_kwargs=rnn,
    fc_kwargs=fc,
)
eugene

ssEUGENE(
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(15,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (recurrentnet): BasicRecurrent(
    (module): LSTM(16, 32, batch_first=True)
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=32, out_features=1, bias=True)
    )
  )
)

In [28]:
# Forward a random batch of 10 inputs shaped (4, 66) through the model and
# display the shape of what comes back.
x = torch.randn((10, 4, 66))
out = eugene(x)
out.size()

torch.Size([10, 1])

## Load data

In [29]:
from torch.utils.data import dataset
from torch.utils.data import DataLoader

### Training set

In [30]:
# Synthetic training data: 1000 random (4, 66) inputs, each paired with a
# 0/1 label drawn from a Bernoulli whose probability is itself uniform in [0, 1).
random_train_x = torch.randn(1000, 4, 66)
random_train_y = torch.bernoulli(torch.empty(1000).uniform_(0, 1))
training_dataset = dataset.TensorDataset(random_train_x, random_train_y)

# Serve the samples in batches of 32 using a single worker process.
training_dataloader = DataLoader(
    training_dataset,
    batch_size=32,
    num_workers=1,
)

# Display the first sample's input shape and its label.
training_dataset[0][0].shape, training_dataset[0][1]

(torch.Size([4, 66]), tensor(1.))

In [31]:
# Smoke test: run up to four training batches (indices 0-3) through the model
# and print the input, label and output shapes for each.
for i_batch, batch in enumerate(training_dataloader):
    x, y = batch[0], batch[1]
    outs = eugene(x)
    print(x.shape, y.shape, outs.shape)
    if i_batch >= 3:
        break

torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])
torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])
torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])
torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])


### Validation set

In [32]:
# Synthetic validation data: 100 random (4, 66) inputs, each paired with a
# 0/1 label drawn from a Bernoulli whose probability is itself uniform in [0, 1).
random_val_x = torch.randn(100, 4, 66)
random_val_y = torch.bernoulli(torch.empty(100).uniform_(0, 1))
validation_dataset = dataset.TensorDataset(random_val_x, random_val_y)

# Serve the samples in batches of 32 using four worker processes.
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=32,
    num_workers=4,
)

# Display the first sample's input shape and its label.
validation_dataset[0][0].shape, validation_dataset[0][1]

(torch.Size([4, 66]), tensor(1.))

In [33]:
# Smoke test: run up to four validation batches (indices 0-3) through the
# model and print the input, label and output shapes for each.
for i_batch, batch in enumerate(validation_dataloader):
    x, y = batch[0], batch[1]
    outs = eugene(x)
    print(x.shape, y.shape, outs.shape)
    if i_batch >= 3:
        break

torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])
torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])
torch.Size([32, 4, 66]) torch.Size([32]) torch.Size([32, 1])
torch.Size([4, 4, 66]) torch.Size([4]) torch.Size([4, 1])


## Training with PyTorch Lightning

In [34]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [35]:
# Log this run to TensorBoard under random_test/ssEUGENE.
logger = TensorBoardLogger("random_test", name="ssEUGENE")

# Train for 10 epochs. Use one GPU when CUDA is available and fall back to
# CPU (gpus=0) otherwise, so the notebook also runs on machines without a GPU
# instead of raising at Trainer construction.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=10,
    logger=logger,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [36]:
# Fit on the random training set and validate on the random validation set.
# Uses the `train_dataloaders` keyword: the singular `train_dataloader`
# spelling is deprecated since Lightning v1.4 and scheduled for removal in
# v1.6 (see the warning this cell previously emitted).
trainer.fit(
    model=eugene,
    train_dataloaders=training_dataloader,
    val_dataloaders=validation_dataloader,
)

  "`trainer.fit(train_dataloader)` is deprecated in v1.4 and will be removed in v1.6."
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Missing logger folder: random_test/ssEUGENE
Set SLURM handle signals.

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 976   
1 | recurrentnet | BasicRecurrent            | 6.4 K 
2 | fcnet        | BasicFullyConnectedModule | 33    
-----------------------------------------------------------
7.4 K     Trainable params
0         Non-trainable params
7.4 K     Total params
0.030     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# MPRADataset

## Instantiate EUGENE architecture: dsEUGENE

In [2]:
from dsEUGENE import dsEUGENE
import claim.utils as cu

In [3]:
# Keyword arguments for the three model components, written as dict literals.
cnn = {
    "input_len": 66,
    "channels": [4, 16],
    "conv_kernels": [15, 5],
    "pool_kernels": [1, 1],
}
rnn = {"output_dim": 32, "batch_first": True}
fc = {"output_dim": 1}

# Build the dsEUGENE model from those kwargs, then pass it through the
# project's weight-initialisation helper.
eugene = dsEUGENE(
    conv_kwargs=cnn,
    rnn_kwargs=rnn,
    fc_kwargs=fc,
)
cu.init_weights(eugene)



## Load data

In [4]:
from load_data import load_csv, load_numpy
from MPRADataset import MPRADataset
from torch.utils.data import DataLoader
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

# Root directory of the OLS library data; every input file below lives under it.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
DATA_DIR = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library"

# Training split: sequence file (read as text) and its label file.
TRAIN_SEQ = f"{DATA_DIR}/seqs/0.09-0.4_seqs-train-0.9.txt"
TRAIN_LABEL = f"{DATA_DIR}/binary/0.09-0.4_y-train-0.9_binary.txt"
train_seqs, train_targets = load_numpy(TRAIN_SEQ, TRAIN_LABEL, is_seq_text=True)
# Evaluated but not displayed (only a cell's last expression is echoed);
# it still fails fast if either array is empty.
len(train_seqs), train_seqs[0], len(train_targets), train_targets[0]

# Validation split, loaded the same way.
VAL_SEQ = f"{DATA_DIR}/seqs/0.09-0.4_seqs-test-0.1.txt"
VAL_LABEL = f"{DATA_DIR}/binary/0.09-0.4_y-test-0.1_binary.txt"
val_seqs, val_targets = load_numpy(VAL_SEQ, VAL_LABEL, is_seq_text=True)
len(val_seqs), val_seqs[0], len(val_targets), val_targets[0]

# Compose different data transforms for this particular load.
# NOTE(review): the same pipeline, including Augment, is also applied to the
# validation set below — confirm that randomised augmentation is intended there.
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"),
    ReverseComplement(ohe_encoded=False),
    OneHotEncode(),
    ToTensor(transpose=True),
])

# Instantiate a Dataset per split
train_dataset = MPRADataset(train_seqs, train_targets, transform=data_transform)
val_dataset = MPRADataset(val_seqs, val_targets, transform=data_transform)

# Instantiate a DataLoader per split (only the training loader shuffles)
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=4)

# Smoke test: run up to four batches (indices 0-3) from each loader through
# the model and print the shapes of the forward strand, reverse complement,
# target and model output. One loop serves both loaders instead of two
# copy-pasted ones, and no_grad() avoids building autograd graphs for what is
# only a shape check.
with torch.no_grad():
    for loader in (train_dataloader, val_dataloader):
        for i_batch, batch in enumerate(loader):
            x, x_rev_comp, y = batch["sequence"], batch["reverse_complement"], batch["target"]
            outs = eugene(x, x_rev_comp)
            print(x.shape, x_rev_comp.shape, y.shape, outs.shape)
            if i_batch == 3:
                break

torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])


## Training with PyTorch Lightning

In [5]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [6]:
# Log this run to TensorBoard under 0.18-0.4_test/dsEUGENE.
# NOTE(review): the sequence/label files loaded for this section are prefixed
# "0.09-0.4", whereas this log directory is named "0.18-0.4" — confirm which
# name is intended so runs are not filed under the wrong threshold.
logger = TensorBoardLogger("0.18-0.4_test", name="dsEUGENE")

# Train for 10 epochs. Use one GPU when CUDA is available and fall back to
# CPU (gpus=0) otherwise, so the notebook also runs on machines without a GPU
# instead of raising at Trainer construction.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=10,
    logger=logger,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [7]:
# Fit the model on the MPRA training loader, validating on the held-out loader.
trainer.fit(
    model=eugene,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

  "`trainer.fit(train_dataloader)` is deprecated in v1.4 and will be removed in v1.6."
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 976   
1 | recurrentnet | BasicRecurrent            | 8.4 K 
2 | fcnet        | BasicFullyConnectedModule | 33    
3 | accuracy     | Accuracy                  | 0     
4 | auroc        | AUROC                     | 0     
-----------------------------------------------------------
9.5 K     Trainable params
0         Non-trainable params
9.5 K     Total params
0.038     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]



Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# MPRADataModule

In [2]:
from MPRADataModule import MPRADataModule

## Load data

In [3]:
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

In [4]:
# Per-sample transform pipeline, applied in the order listed.
transform_steps = [
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"),
    ReverseComplement(ohe_encoded=False),
    OneHotEncode(),
    ToTensor(transpose=True),
]
data_transform = transforms.Compose(transform_steps)

In [5]:
# Table holding the full OLS library.
OLS_TSV = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv"

# Options forwarded to the data module as load_kwargs: the target column and
# the low/high thresholds.
load_options = {
    "target_col": "ACTIVITY_SUMRNA_NUMDNA",
    "low_thresh": 0.18,
    "high_thresh": 0.4,
}

# Data module wrapping the table, using the transform pipeline defined above.
mod = MPRADataModule(
    seq_file=OLS_TSV,
    transform=data_transform,
    num_workers=4,
    batch_size=512,
    load_kwargs=load_options,
)

## Instantiate EUGENE architecture: dsEUGENE

In [1]:
import claim.utils as cu
from dsEUGENE import dsEUGENE

ModuleNotFoundError: No module named 'dsEUGENE'

In [7]:
# Keyword arguments for the three model components, written as dict literals.
# They are handed to the model constructor as conv_kwargs / rnn_kwargs /
# fc_kwargs in the next cell.
cnn = {
    "input_len": 66,
    "channels": [4, 16],
    "conv_kernels": [15, 5],
    "pool_kernels": [1, 1],
}
rnn = {"output_dim": 32, "batch_first": True}
fc = {"output_dim": 1}

In [8]:
# Build the dsEUGENE model from the kwargs dicts, pass it through the
# project's weight-initialisation helper, and display the module's repr.
eugene = dsEUGENE(
    conv_kwargs=cnn,
    rnn_kwargs=rnn,
    fc_kwargs=fc,
)
cu.init_weights(eugene)
eugene



dsEUGENE(
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(15,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (recurrentnet): BasicRecurrent(
    (module): LSTM(32, 32, batch_first=True)
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=32, out_features=1, bias=True)
    )
  )
  (accuracy): Accuracy()
  (auroc): AUROC()
)

## Training with PyTorch Lightning

In [9]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [10]:
# Log this run to TensorBoard under 0.18-0.4_test/dsEUGENE.
logger = TensorBoardLogger("0.18-0.4_test", name="dsEUGENE")

# Short 2-epoch run. Use one GPU when CUDA is available and fall back to CPU
# (gpus=0) otherwise, so the notebook also runs on machines without a GPU
# instead of raising at Trainer construction.
trainer = pl.Trainer(
    gpus=1 if torch.cuda.is_available() else 0,
    max_epochs=2,
    logger=logger,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [11]:
# Fit the model, letting the data module supply the dataloaders.
trainer.fit(
    model=eugene,
    datamodule=mod,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 976   
1 | recurrentnet | BasicRecurrent            | 8.4 K 
2 | fcnet        | BasicFullyConnectedModule | 33    
3 | accuracy     | Accuracy                  | 0     
4 | auroc        | AUROC                     | 0     
-----------------------------------------------------------
9.5 K     Trainable params
0         Non-trainable params
9.5 K     Total params
0.038     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# References