# Testing EUGENE utilities and PyTorch integrations

**Authorship:**
Adam Klie, *03/22/2022*
***
**Description:**
Notebook for testing out useful Pytorch functionality and EUGENE utilities. Kind of miscellaneous testing

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Optuna integration</li></b>
    <b><li></li></b>
    </ul>
</div>

In [1]:
# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import sys
sys.path.append("../eugene")

# CLI

In [41]:
from pytorch_lightning.trainer import Trainer

In [4]:
from dsEUGENE import dsEUGENE

In [38]:
from MPRADataModule import MPRADataModule

In [2]:
from pytorch_lightning.utilities.cli import LightningCLI

In [24]:
from pytorch_lightning.utilities.cli import LightningArgumentParser

In [27]:
config="/cellar/users/aklie/projects/EUGENE/eugene/config/base_config.yaml"

In [31]:
parser = LightningArgumentParser()

In [42]:
parser.add_lightning_class_args(Trainer, nested_key="trainer")

['trainer.logger',
 'trainer.checkpoint_callback',
 'trainer.enable_checkpointing',
 'trainer.callbacks',
 'trainer.default_root_dir',
 'trainer.gradient_clip_val',
 'trainer.gradient_clip_algorithm',
 'trainer.process_position',
 'trainer.num_nodes',
 'trainer.num_processes',
 'trainer.devices',
 'trainer.gpus',
 'trainer.auto_select_gpus',
 'trainer.tpu_cores',
 'trainer.ipus',
 'trainer.log_gpu_memory',
 'trainer.progress_bar_refresh_rate',
 'trainer.enable_progress_bar',
 'trainer.overfit_batches',
 'trainer.track_grad_norm',
 'trainer.check_val_every_n_epoch',
 'trainer.fast_dev_run',
 'trainer.accumulate_grad_batches',
 'trainer.max_epochs',
 'trainer.min_epochs',
 'trainer.max_steps',
 'trainer.min_steps',
 'trainer.max_time',
 'trainer.limit_train_batches',
 'trainer.limit_val_batches',
 'trainer.limit_test_batches',
 'trainer.limit_predict_batches',
 'trainer.val_check_interval',
 'trainer.flush_logs_every_n_steps',
 'trainer.log_every_n_steps',
 'trainer.accelerator',
 'train

In [39]:
parser.add_lightning_class_args(MPRADataModule, nested_key="data")

['data.seq_file',
 'data.batch_size',
 'data.num_workers',
 'data.transform',
 'data.split',
 'data.load_kwargs']

In [34]:
parser.add_lightning_class_args(dsEUGENE, nested_key="model")

['model.conv_kwargs',
 'model.rnn_kwargs',
 'model.fc_kwargs',
 'model.learning_rate']

In [79]:
from pytorch_lightning.loggers import TensorBoardLogger

In [80]:
logger = TensorBoardLogger("0.18-0.4_test", name="dsEUGENE")

In [84]:
import pytorch_lightning as pl

In [85]:
trainer = pl.Trainer(gpus=1, max_epochs=2, logger=logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [89]:
parser.add_lightning_class_args(pl.Trainer, nested_key="trainer")

ValueError: Group with name trainer already exists.

In [44]:
yml = parser.parse_path(cfg_path=config)

In [97]:
test = MPRADataModule(**yml['data'])

In [98]:
test.seq_file

'/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv'

In [95]:
yml["data"]["seq_file"]

'/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv'

In [91]:
yml["trainer"]["logger"]

{'class_path': 'pytorch_lightning.loggers.TensorBoardLogger',
 'init_args': {'save_dir': '/cellar/users/aklie/projects/EUGENE/results/eugene/0.18-0.4',
  'name': 'dsEUGENE',
  'version': None,
  'log_graph': False,
  'default_hp_metric': True,
  'prefix': '',
  'sub_dir': None}}

In [78]:
test.batch_size

32

In [92]:
num = 32
num2 = 8

In [93]:
[x*num2 if x != 0 else 1 for x in range(0, num//num2 + 1)]

[1, 8, 16, 24, 32]

In [49]:
import numpy as np

[8, 16, 24, 32, 40, 48, 56, 64]

In [55]:
np.arange?

[0;31mDocstring:[0m
arange([start,] stop[, step,], dtype=None, *, like=None)

Return evenly spaced values within a given interval.

Values are generated within the half-open interval ``[start, stop)``
(in other words, the interval including `start` but excluding `stop`).
For integer arguments the function is equivalent to the Python built-in
`range` function, but returns an ndarray rather than a list.

When using a non-integer step, such as 0.1, the results will often not
be consistent.  It is better to use `numpy.linspace` for these cases.

Parameters
----------
start : integer or real, optional
    Start of interval.  The interval includes this value.  The default
    start value is 0.
stop : integer or real
    End of interval.  The interval does not include this value, except
    in some cases where `step` is not an integer and floating point
    round-off affects the length of `out`.
step : integer or real, optional
    Spacing between values.  For any output `out`, this is the 

# Load a model and some test data

## Load model

In [17]:
from dsEUGENE import dsEUGENE

cnn=dict(input_len=66, channels=[4, 16], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn=dict(output_dim=32, batch_first=True)
fc=dict(output_dim=1)
eugene = dsEUGENE(conv_kwargs=cnn, rnn_kwargs=rnn, fc_kwargs=fc)

x = torch.randn(10, 4, 66)
out = eugene(x, x)
out.shape

## Load data

In [19]:
from torch.utils.data import dataset
from torch.utils.data import DataLoader

training_dataset = dataset.TensorDataset(torch.validationn(1000, 4, 66), torch.bernoulli(torch.empty(1000).uniform_(0,1)))
training_dataloader = DataLoader(training_dataset, batch_size=32, num_workers=4)

for i_batch, batch in enumerate(training_dataloader):
    x, y = batch
    outs = eugene(x)
    print(x.shape, y.shape, outs.shape)
    if i_batch==3:
        break

# DataModule

## Test random_split torch function

In [34]:
from torch.utils.data import random_split

In [38]:
dataset_len = len(training_dataset)
train_len = int(dataset_len*0.9)
val_len = dataset_len - train_len

In [40]:
train_dataset, val_dataset = random_split(training_dataset, [train_len, val_len])

In [42]:
len(train_dataset)

272642

# Torchmetrics logging

In [21]:
import torchmetrics

In [48]:
acc = torchmetrics.Accuracy()
preds = torch.round(torch.sigmoid(outs))
y.long().unsqueeze(dim=1)
acc.update(preds, y.long().unsqueeze(dim=1))
acc.compute()

In [91]:
auroc = torchmetrics.AUROC()
auroc.update(torch.sigmoid(outs), y.long().unsqueeze(dim=1))
auroc.compute()

# Optuna integration

In [2]:
from MPRADataModule import MPRADataModule

## Load data

In [3]:
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

In [4]:
data_transform = 

In [6]:
OLS_TSV = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv"
mod = MPRADataModule(seq_file=OLS_TSV,
                     transform=data_transform,
                     num_workers=4,
                     batch_size=128,
                     load_kwargs=dict(target_col="ACTIVITY_SUMRNA_NUMDNA", low_thresh=0.18, high_thresh=0.4))

Compose(
    <transforms.Augment object at 0x155550d4f610>
    <transforms.ReverseComplement object at 0x155550d4f910>
    <transforms.OneHotEncode object at 0x155550d4f310>
    <transforms.ToTensor object at 0x155550d4ff50>
)


In [10]:
mod.setup()

  fn(*args, **kwargs)


Compose(
    <transforms.Augment object at 0x155550d4f610>
    <transforms.ReverseComplement object at 0x155550d4f910>
    <transforms.OneHotEncode object at 0x155550d4f310>
    <transforms.ToTensor object at 0x155550d4ff50>
)


In [14]:
mod.train_dataloader().dataset[0]["sequence"].shape

torch.Size([4, 66])

## Test objective

In [13]:
from dsEUGENE import objective

In [14]:
import optuna

In [15]:
trial = optuna.trial.FixedTrial({"fcn_n_layers": 1, "fcn_dropout":0.2, "fcn_n_units_l0":4})

In [22]:
objective(trial, mod, max_epochs=2)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name         | Type                 | Params
------------------------------------------------------
0 | convnet      | BasicConv1D          | 976   
1 | recurrentnet | BasicRecurrent       | 8 K   
2 | fcnet        | FullyConnectedModule | 137   
3 | accuracy     | Accuracy             | 0     
4 | auroc        | AUROC                | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

0.7410488128662109

In [23]:
trial.number

0

In [8]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-03-21 19:31:52,680][0m A new study created in memory with name: no-name-33c62b8b-30b6-4731-afa9-5017148cd173[0m


In [16]:
from optuna.distributions import UniformDistribution

In [17]:
trial = optuna.trial.create_trial(
    params={"x": 2.0},
    distributions={"x": UniformDistribution(0, 10)},
    value=4.0,
)

  after removing the cwd from sys.path.


In [18]:
study.add_trial(trial)

  """Entry point for launching an IPython kernel.


In [19]:
objective(trial)

ValueError: The value of the parameter 'fcn_n_layers' is not found. Please set it at the construction of the FrozenTrial object.