# Testing EUGENE utilities and PyTorch integrations

**Authorship:**
Adam Klie, *03/22/2022*
***
**Description:**
Notebook for testing out useful Pytorch functionality and EUGENE utilities. Kind of miscellaneous testing

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li>Optuna integration</li></b>
    <b><li></li></b>
    </ul>
</div>

In [1]:
# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import sys
sys.path.append("../scripts")

# Load a model and some test data

## Load model

In [17]:
from dsEUGENE import dsEUGENE

cnn=dict(input_len=66, channels=[4, 16], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn=dict(output_dim=32, batch_first=True)
fc=dict(output_dim=1)
eugene = dsEUGENE(conv_kwargs=cnn, rnn_kwargs=rnn, fc_kwargs=fc)

x = torch.randn(10, 4, 66)
out = eugene(x, x)
out.shape

## Load data

In [19]:
from torch.utils.data import dataset
from torch.utils.data import DataLoader

training_dataset = dataset.TensorDataset(torch.validationn(1000, 4, 66), torch.bernoulli(torch.empty(1000).uniform_(0,1)))
training_dataloader = DataLoader(training_dataset, batch_size=32, num_workers=4)

for i_batch, batch in enumerate(training_dataloader):
    x, y = batch
    outs = eugene(x)
    print(x.shape, y.shape, outs.shape)
    if i_batch==3:
        break

# DataModule

## Test random_split torch function

In [34]:
from torch.utils.data import random_split

In [38]:
dataset_len = len(training_dataset)
train_len = int(dataset_len*0.9)
val_len = dataset_len - train_len

In [40]:
train_dataset, val_dataset = random_split(training_dataset, [train_len, val_len])

In [42]:
len(train_dataset)

272642

# Torchmetrics logging

In [21]:
import torchmetrics

In [48]:
acc = torchmetrics.Accuracy()
preds = torch.round(torch.sigmoid(outs))
y.long().unsqueeze(dim=1)
acc.update(preds, y.long().unsqueeze(dim=1))
acc.compute()

In [91]:
auroc = torchmetrics.AUROC()
auroc.update(torch.sigmoid(outs), y.long().unsqueeze(dim=1))
auroc.compute()

# Optuna integration

In [2]:
from MPRADataModule import MPRADataModule

## Load data

In [3]:
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

In [4]:
data_transform = 

In [6]:
OLS_TSV = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv"
mod = MPRADataModule(seq_file=OLS_TSV,
                     transform=data_transform,
                     num_workers=4,
                     batch_size=128,
                     load_kwargs=dict(target_col="ACTIVITY_SUMRNA_NUMDNA", low_thresh=0.18, high_thresh=0.4))

Compose(
    <transforms.Augment object at 0x155550d4f610>
    <transforms.ReverseComplement object at 0x155550d4f910>
    <transforms.OneHotEncode object at 0x155550d4f310>
    <transforms.ToTensor object at 0x155550d4ff50>
)


In [10]:
mod.setup()

  fn(*args, **kwargs)


Compose(
    <transforms.Augment object at 0x155550d4f610>
    <transforms.ReverseComplement object at 0x155550d4f910>
    <transforms.OneHotEncode object at 0x155550d4f310>
    <transforms.ToTensor object at 0x155550d4ff50>
)


In [14]:
mod.train_dataloader().dataset[0]["sequence"].shape

torch.Size([4, 66])

## Test objective

In [13]:
from dsEUGENE import objective

In [14]:
import optuna

In [15]:
trial = optuna.trial.FixedTrial({"fcn_n_layers": 1, "fcn_dropout":0.2, "fcn_n_units_l0":4})

In [22]:
objective(trial, mod, max_epochs=2)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name         | Type                 | Params
------------------------------------------------------
0 | convnet      | BasicConv1D          | 976   
1 | recurrentnet | BasicRecurrent       | 8 K   
2 | fcnet        | FullyConnectedModule | 137   
3 | accuracy     | Accuracy             | 0     
4 | auroc        | AUROC                | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

0.7410488128662109

In [23]:
trial.number

0

In [8]:
study = optuna.create_study(direction="maximize")

[32m[I 2022-03-21 19:31:52,680][0m A new study created in memory with name: no-name-33c62b8b-30b6-4731-afa9-5017148cd173[0m


In [16]:
from optuna.distributions import UniformDistribution

In [17]:
trial = optuna.trial.create_trial(
    params={"x": 2.0},
    distributions={"x": UniformDistribution(0, 10)},
    value=4.0,
)

  after removing the cwd from sys.path.


In [18]:
study.add_trial(trial)

  """Entry point for launching an IPython kernel.


In [19]:
objective(trial)

ValueError: The value of the parameter 'fcn_n_layers' is not found. Please set it at the construction of the FrozenTrial object.