# Testing EUGENE training

**Authorship:**
Adam Klie, *03/19/2022*
***
**Description:**
Notebook for testing the training of EUGENE architectures

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <li><b></b></li>
    </ul>
</div>

# Set-up

In [1]:
import numpy as np
import pandas as pd

# Autoreload extension: load it only when it is not already registered with this
# kernel, so the cell can be re-run without loading the extension a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: re-import edited modules before each cell execution.
%autoreload 2

In [2]:
from eugene.dataloading.SeqDataModule import SeqDataModule

# Benchmark CNN

In [3]:
# Benchmark settings for the CNN run.
# NUM_SEQS and SEQ_LEN select the test-data directory; all five values are
# embedded in the TensorBoard log directory name further down.
NUM_SEQS = 100
SEQ_LEN = 66
DATA_TYPE = "tsv"
BATCH_SIZE = 32
NUM_WORKERS = 0

## Load data

In [4]:
from torchvision import transforms
from eugene.utils.seq_transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

In [5]:
# Per-sequence preprocessing pipeline; the steps run in the order listed.
data_transform = transforms.Compose(
    [
        Augment(enhancer="WT-otx-a", randomize_linker_p=0.1),
        ReverseComplement(ohe_encoded=False),
        OneHotEncode(),
        ToTensor(transpose=True),
    ]
)

In [14]:
# Data module for the small test set. The directory name is derived from the
# benchmark settings (NUM_SEQS, SEQ_LEN).
DATA_DIR = "test_{}seqs_{}/".format(NUM_SEQS, SEQ_LEN)
# num_workers comes from NUM_WORKERS so that the value actually used matches the
# "num_workers-..." label written into LOG_DIR for this benchmark.
mod = SeqDataModule(seq_file=DATA_DIR + "test_seqs.tsv",
                    transform=data_transform,
                    num_workers=NUM_WORKERS,
                    batch_size=BATCH_SIZE,
                    load_kwargs=dict(seq_col="SEQ", target_col="LABEL"))
# Alternative load_kwargs for pre-computed numpy inputs, kept for reference:
#load_kwargs=dict(target_file=DATA_DIR + "test_labels.npy", rev_seq_file=DATA_DIR + "test_rev_ohe_seqs.npy"))

## Instantiate architecture: CNN

In [20]:
import claim.utils as cu
from eugene.models.CNN import CNN

In [21]:
# Keyword arguments for the convolutional and fully connected sub-modules.
# input_len is taken from SEQ_LEN (66) so the model input length follows the
# same setting that selects the test data, instead of a duplicated literal.
cnn = dict(input_len=SEQ_LEN, channels=[4, 16, 32], conv_kernels=[15, 5], pool_kernels=[1, 1])
fc = dict(output_dim=1)

In [22]:
# Build the CNN from the kwargs dicts defined in the previous cell and
# initialise its weights.
# NOTE: this rebinds `cnn` from the conv kwargs dict to the model itself, so the
# kwargs cell has to be re-run before this cell can be re-run.
cnn = CNN(
    conv_kwargs=cnn,
    fc_kwargs=fc,
    ds=True,
    classification=True,
)
cu.init_weights(cnn)
cnn

CNN(
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(15,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
      (3): Conv1d(16, 32, kernel_size=(5,), stride=(1,))
      (4): ReLU(inplace=True)
      (5): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=3072, out_features=1, bias=True)
    )
  )
  (accuracy): Accuracy()
  (auroc): AUROC()
)

In [23]:
# Show index 0 of the model's first parameter tensor (presumably to eyeball the
# result of cu.init_weights above).
next(cnn.parameters())[0]

tensor([[ 0.1258,  0.0939, -0.0455, -0.0200,  0.0408, -0.0145, -0.0962,  0.0621,
          0.0158, -0.1183, -0.0346,  0.0850,  0.1175,  0.0606,  0.1100],
        [ 0.0030,  0.0571, -0.0182, -0.0116,  0.1212, -0.0068, -0.0711, -0.1224,
          0.0619, -0.0199,  0.1055,  0.0164,  0.0069, -0.1041, -0.0121],
        [ 0.0337, -0.1199, -0.1241,  0.0143, -0.0352,  0.0982, -0.0180, -0.1183,
          0.1008,  0.0295,  0.0162, -0.0489,  0.0807,  0.1160, -0.0924],
        [ 0.1148,  0.0412, -0.0034, -0.1257,  0.0435,  0.0790, -0.1144,  0.0049,
          0.0803, -0.0277, -0.0420,  0.0491, -0.0300, -0.1002, -0.1010]],
       grad_fn=<SelectBackward0>)

## Train (and time) with PyTorch Lightning

In [24]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [25]:
# The log directory name records the benchmark settings of this run.
LOG_DIR = (
    "../test/test_logs/"
    f"batch_size-{BATCH_SIZE}.num_workers-{NUM_WORKERS}.data_type-{DATA_TYPE}"
    f".num_seq-{NUM_SEQS}.seq_len-{SEQ_LEN}"
)
logger = TensorBoardLogger(LOG_DIR, version="ds_classification", name="cnn")
trainer = pl.Trainer(logger=logger, max_epochs=10, gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [26]:
# Train the CNN on the batches served by the data module.
trainer.fit(
    datamodule=mod,
    model=cnn,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name     | Type                      | Params
-------------------------------------------------------
0 | convnet  | BasicConv1D               | 3.6 K 
1 | fcnet    | BasicFullyConnectedModule | 3.1 K 
2 | accuracy | Accuracy                  | 0     
3 | auroc    | AUROC                     | 0     
-------------------------------------------------------
6.6 K     Trainable params
0         Non-trainable params
6.6 K     Total params
0.027     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Benchmark dsEUGENE
<div class="alert alert-info" role="alert">
  <b>Benchmark dsEUGENE models</b>
</div>

In [10]:
# Benchmark settings for the dsEUGENE run (same values as the CNN benchmark).
# NUM_SEQS and SEQ_LEN select the test-data directory; all five values are
# embedded in the TensorBoard log directory name further down.
NUM_SEQS = 100
SEQ_LEN = 66
DATA_TYPE = "tsv"
BATCH_SIZE = 32
NUM_WORKERS = 0

## Load data

In [11]:
from torchvision import transforms
from eugene.utils.seq_transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

In [12]:
# Per-sequence preprocessing pipeline; the steps run in the order listed.
data_transform = transforms.Compose(
    [
        Augment(enhancer="WT-otx-a", randomize_linker_p=0.1),
        ReverseComplement(ohe_encoded=False),
        OneHotEncode(),
        ToTensor(transpose=True),
    ]
)

In [13]:
# Data module for the small test set, using the ACTIVITY column with low/high
# thresholds as the target. The directory name is derived from the benchmark
# settings (NUM_SEQS, SEQ_LEN).
DATA_DIR = "test_{}seqs_{}/".format(NUM_SEQS, SEQ_LEN)
# num_workers comes from NUM_WORKERS so that the value actually used matches the
# "num_workers-..." label written into LOG_DIR for this benchmark.
mod = SeqDataModule(seq_file=DATA_DIR + "test_seqs.tsv",
                    transform=data_transform,
                    num_workers=NUM_WORKERS,
                    batch_size=BATCH_SIZE,
                    load_kwargs=dict(seq_col="SEQ", target_col="ACTIVITY", low_thresh=0.09, high_thresh=0.4))
# Alternative load_kwargs for pre-computed numpy inputs, kept for reference:
#load_kwargs=dict(target_file=DATA_DIR + "test_labels.npy", rev_seq_file=DATA_DIR + "test_rev_ohe_seqs.npy"))

## Instantiate EUGENE architecture: dsEUGENE

In [15]:
import claim.utils as cu
from eugene.models.dsEUGENE import dsEUGENE

In [16]:
# Keyword arguments for the convolutional, recurrent and fully connected
# sub-modules of dsEUGENE.
# input_len is taken from SEQ_LEN (66) so the model input length follows the
# same setting that selects the test data, instead of a duplicated literal.
cnn = dict(input_len=SEQ_LEN, channels=[4, 16, 32], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn = dict(output_dim=32, batch_first=True)
fc = dict(output_dim=1)

In [17]:
# Build dsEUGENE from the kwargs dicts defined in the previous cell, initialise
# its weights, and display the module tree.
eugene = dsEUGENE(
    fc_kwargs=fc,
    rnn_kwargs=rnn,
    conv_kwargs=cnn,
)
cu.init_weights(eugene)
eugene



dsEUGENE(
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(15,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
      (3): Conv1d(16, 32, kernel_size=(5,), stride=(1,))
      (4): ReLU(inplace=True)
      (5): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (recurrentnet): BasicRecurrent(
    (module): LSTM(64, 32, batch_first=True)
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=32, out_features=1, bias=True)
    )
  )
  (accuracy): Accuracy()
  (auroc): AUROC()
)

In [18]:
# Show index 0 of the model's first parameter tensor (presumably to eyeball the
# result of cu.init_weights above).
next(eugene.parameters())[0]

tensor([[ 0.0976,  0.0978,  0.0585,  0.1228, -0.0595,  0.1031,  0.0438,  0.1063,
          0.0036, -0.1268, -0.1015, -0.0787, -0.1273,  0.1059, -0.0299],
        [ 0.0327,  0.1275, -0.0565, -0.0433,  0.0588, -0.0395,  0.0510, -0.0235,
         -0.0305,  0.0379, -0.0716, -0.0139,  0.0997, -0.0119,  0.0234],
        [ 0.0125, -0.1040, -0.0366,  0.0776, -0.0395,  0.0229,  0.0047, -0.0082,
         -0.0459,  0.0842,  0.1201,  0.0976,  0.0154,  0.0340, -0.0016],
        [ 0.1215,  0.0317,  0.1269, -0.0114,  0.1286, -0.1020, -0.1156, -0.0081,
          0.0828,  0.0014,  0.0338, -0.0051,  0.0270,  0.1262, -0.0590]],
       grad_fn=<SelectBackward0>)

## Train (and time) with PyTorch Lightning

In [19]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [20]:
# The log directory name records the benchmark settings of this run.
LOG_DIR = (
    "../test/test_logs/"
    f"batch_size-{BATCH_SIZE}.num_workers-{NUM_WORKERS}.data_type-{DATA_TYPE}"
    f".num_seq-{NUM_SEQS}.seq_len-{SEQ_LEN}"
)
logger = TensorBoardLogger(LOG_DIR, name="dsEUGENE")
trainer = pl.Trainer(logger=logger, max_epochs=10, gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [21]:
# Train dsEUGENE on the batches served by the data module.
trainer.fit(
    datamodule=mod,
    model=eugene,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 3.6 K 
1 | recurrentnet | BasicRecurrent            | 12.5 K
2 | fcnet        | BasicFullyConnectedModule | 33    
3 | accuracy     | Accuracy                  | 0     
4 | auroc        | AUROC                     | 0     
-----------------------------------------------------------
16.1 K    Trainable params
0         Non-trainable params
16.1 K    Total params
0.065     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [22]:
# Sanity check batch size: print the size of the first element of at most the
# first four training batches. Pairing with range(4) stops the iteration after
# the fourth batch without requesting a fifth one from the loader.
for i, batch in zip(range(4), mod.train_dataloader()):
    print(i, batch[0].size())

0 torch.Size([32, 1])
1 torch.Size([32, 1])


In [23]:
# Drop the benchmark objects from the notebook namespace before the scratch section.
del logger
del trainer
del mod
del eugene

In [None]:
# List the names still defined in the interactive namespace after the cleanup above.
%whos

# Scratch

## OLS Dataset

In [6]:
# Data module for the full OLS library.
# NOTE: despite its name, DATA_DIR holds the path of the library TSV file itself here.
DATA_DIR = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv"
mod = SeqDataModule(
    seq_file=DATA_DIR,
    load_kwargs=dict(seq_col="SEQ", target_col="ACTIVITY_SUMRNA_NUMDNA", low_thresh=0.09, high_thresh=0.4),
    batch_size=BATCH_SIZE,
    num_workers=4,
    transform=data_transform,
)
#load_kwargs=dict(target_file=DATA_DIR + "test_labels.npy", rev_seq_file=DATA_DIR + "test_rev_ohe_seqs.npy"))

In [20]:
# Check the DataLoader: push at most the first four training batches through the
# model and print the size of the squeezed output.
# NOTE(review): `cnn` must be a model here; in top-to-bottom notebook order the
# name was last bound to a kwargs dict in the dsEUGENE section — confirm which
# model this cell is meant to use.
for i_batch, sample_batched in zip(range(4), mod.train_dataloader()):
    _, x, x_rev_comp, y = sample_batched
    outs = cnn(x, x_rev_comp).squeeze(dim=1)
    print(outs.size())

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x1554c0618950>
Traceback (most recent call last):
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
    if w.is_alive():
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


torch.Size([32])
torch.Size([32])
torch.Size([32])
torch.Size([32])


In [22]:
# Targets of the last batch drawn in the "Check the DataLoader" cell.
y

tensor([0.0560, 0.0000, 0.6861, 0.6930, 0.8344, 0.0542, 0.0153, 0.4634, 0.0633,
        0.5290, 0.5237, 0.0828, 0.0309, 0.0851, 0.0788, 0.0354, 0.4270, 0.0248,
        0.0000, 0.4262, 0.0467, 0.0056, 0.4720, 0.0827, 0.5643, 0.6700, 0.0135,
        0.5856, 0.0811, 0.4587, 0.5047, 0.0000])

In [21]:
# Model outputs for the last batch drawn in the "Check the DataLoader" cell.
outs

tensor([0.3632, 0.3531, 0.5275, 0.4360, 0.5880, 0.2677, 0.3268, 0.4288, 0.4799,
        0.5275, 0.2928, 0.4075, 0.4050, 0.2728, 0.3124, 0.2721, 0.4809, 0.3075,
        0.5111, 0.5577, 0.3173, 0.2157, 0.3513, 0.5247, 0.5892, 0.5645, 0.5235,
        0.3510, 0.2966, 0.2659, 0.4410, 0.2004], grad_fn=<SqueezeBackward1>)

In [104]:
# Run the data module's setup step by hand so its datasets can be inspected
# without going through trainer.fit.
mod.setup()

In [105]:
# Dataset object behind the training DataLoader.
tar = mod.train_dataloader().dataset

In [106]:
# Count infinite values among the targets.
# NOTE(review): `tar.dataset` suggests `tar` wraps an underlying dataset (e.g. a
# split/subset) — TODO confirm.
np.isinf(tar.dataset.targets).sum()

0

In [18]:
# Short two-epoch run on one GPU; no logger is passed here.
trainer = pl.Trainer(
    max_epochs=2,
    gpus=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [19]:
# Train on the OLS data module.
trainer.fit(
    datamodule=mod,
    model=cnn,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type                      | Params
--------------------------------------------------------
0 | convnet   | BasicConv1D               | 3.6 K 
1 | fcnet     | BasicFullyConnectedModule | 1.5 K 
2 | r_squared | R2Score                   | 0     
--------------------------------------------------------
5.1 K     Trainable params
0         Non-trainable params
5.1 K     Total params
0.020     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x1554c0618950>
Traceback (most recent call last):
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x1554c0618950>
Traceback (most recent call last):
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
    self._shutdown_workers()Traceback (most recent call last):
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/cellar/users/aklie/opt/miniconda3/envs/pytorch_dev/lib/python3.7/multiproce

In [11]:
import torch.nn.functional as F

In [12]:
# Mean squared error between the model outputs and the targets.
# NOTE(review): the saved output of this cell is a NameError — `outs` and `y` only
# exist after the "Check the DataLoader" cell has been run in the same kernel session.
F.mse_loss(outs, y)

NameError: name 'outs' is not defined

## Random dataset
<div class="alert alert-info" role="alert">
  <b>Just test out the basic functionality of our EUGENE architectures on some random data</b>
</div>

### Load data

In [29]:
from torch.utils.data import dataset
from torch.utils.data import DataLoader

# torch is used throughout this cell but is not imported anywhere else in the notebook.
import torch

## Instantiate EUGENE architecture: ssEUGENE
# The model is built before the smoke-test loops because they call `eugene(x)`,
# and `eugene` was deleted at the end of the dsEUGENE benchmark.
cnn=dict(input_len=66, channels=[4, 16], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn=dict(output_dim=32, batch_first=True)
fc=dict(output_dim=1)

from ssEUGENE import ssEUGENE
eugene = ssEUGENE(conv_kwargs=cnn, rnn_kwargs=rnn, fc_kwargs=fc)

### Training set
# Random (4, 66) inputs with random 0/1 labels.
training_dataset = dataset.TensorDataset(torch.randn(1000, 4, 66), torch.bernoulli(torch.empty(1000).uniform_(0,1)))
training_dataloader = DataLoader(training_dataset, batch_size=32, num_workers=1)
assert training_dataset[0][0].shape == (4, 66)

# Forward the first four batches and print the input, target and output shapes.
for i_batch, batch in enumerate(training_dataloader):
    x, y = batch
    outs = eugene(x)
    print(x.shape, y.shape, outs.shape)
    if i_batch==3:
        break

### Validation set
validation_dataset = dataset.TensorDataset(torch.randn(100, 4, 66), torch.bernoulli(torch.empty(100).uniform_(0,1)))
validation_dataloader = DataLoader(validation_dataset, batch_size=32, num_workers=4)
assert validation_dataset[0][0].shape == (4, 66)

for i_batch, batch in enumerate(validation_dataloader):
    x, y = batch
    outs = eugene(x)
    print(x.shape, y.shape, outs.shape)
    if i_batch==3:
        break

# Single forward pass on a fresh random batch; the output shape is the cell's result.
x = torch.randn(10, 4, 66)
out = eugene(x)
out.shape

### Training with PyTorch Lightning

In [36]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

# Log the random-data run to TensorBoard under random_test/ssEUGENE.
logger = TensorBoardLogger("random_test", name="ssEUGENE")
trainer = pl.Trainer(gpus=1, max_epochs=10, logger=logger)

# `train_dataloaders` (plural) is the keyword used by the other explicit-dataloader
# fit call in this notebook; the singular `train_dataloader` is the deprecated spelling.
trainer.fit(model=eugene, train_dataloaders=training_dataloader, val_dataloaders=validation_dataloader)

## MPRADataset
<div class="alert alert-info" role="alert">
  <b>Just test out the basic functionality of our EUGENE architectures on an MPRA dataset</b>
</div>

### Load data and instantiate EUGENE architecture: dsEUGENE

In [4]:
from eugene.load_data import load_csv, load_numpy
from eugene.MPRADataset import MPRADataset
from torch.utils.data import DataLoader
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor
from dsEUGENE import dsEUGENE
import claim.utils as cu

# All four input files live under the same library directory.
OLS_DIR = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library"

TRAIN_SEQ = OLS_DIR + "/seqs/0.09-0.4_seqs-train-0.9.txt"
TRAIN_LABEL = OLS_DIR + "/binary/0.09-0.4_y-train-0.9_binary.txt"
train_seqs, train_targets = load_numpy(TRAIN_SEQ, TRAIN_LABEL, is_seq_text=True)
assert len(train_seqs) == len(train_targets), "training sequences and labels differ in length"

VAL_SEQ = OLS_DIR + "/seqs/0.09-0.4_seqs-test-0.1.txt"
VAL_LABEL = OLS_DIR + "/binary/0.09-0.4_y-test-0.1_binary.txt"
val_seqs, val_targets = load_numpy(VAL_SEQ, VAL_LABEL, is_seq_text=True)
assert len(val_seqs) == len(val_targets), "validation sequences and labels differ in length"

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=True)
])

# Instantiate a Dataset
train_dataset = MPRADataset(train_seqs, train_targets, transform=data_transform)
val_dataset = MPRADataset(val_seqs, val_targets, transform=data_transform)

# Instantiate a DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=4)

# Instantiate the double-stranded model before the smoke-test loops: they call
# `eugene(x, x_rev_comp)`, so the dsEUGENE instance has to exist first.
cnn=dict(input_len=66, channels=[4, 16], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn=dict(output_dim=32, batch_first=True)
fc=dict(output_dim=1)

eugene = dsEUGENE(conv_kwargs=cnn, rnn_kwargs=rnn, fc_kwargs=fc)
cu.init_weights(eugene)

# Forward the first four batches of each loader and print the input, target and
# output shapes.
for i_batch, batch in enumerate(train_dataloader):
    x, x_rev_comp, y = batch["sequence"], batch["reverse_complement"], batch["target"]
    outs = eugene(x, x_rev_comp)
    print(x.shape, x_rev_comp.shape, y.shape, outs.shape)
    if i_batch==3:
        break

for i_batch, batch in enumerate(val_dataloader):
    x, x_rev_comp, y = batch["sequence"], batch["reverse_complement"], batch["target"]
    outs = eugene(x, x_rev_comp)
    print(x.shape, x_rev_comp.shape, y.shape, outs.shape)
    if i_batch==3:
        break

torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])


### Training with PyTorch Lightning

In [7]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

# Log the MPRA run to TensorBoard and train with explicit train/validation loaders.
# NOTE(review): the log directory is named "0.18-0.4_test" while the input files
# loaded for this run are named "0.09-0.4_..." — confirm the label is intended.
logger = TensorBoardLogger("0.18-0.4_test", name="dsEUGENE")
trainer = pl.Trainer(logger=logger, max_epochs=10, gpus=1)

trainer.fit(
    model=eugene,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# References