# Testing EUGENE `train` module

**Authorship:**
Adam Klie, *03/19/2022*
***
**Description:**
Notebook for testing the training of EUGENE architectures

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <li><b></b></li>
</ul>
</div>

# Set-up

In [2]:
import numpy as np
import pandas as pd

# Autoreload extension.
# Load it only when it is not already registered with this IPython session,
# so re-running the cell does not try to load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: per IPython's autoreload documentation, reload all modules before
# executing code, so edits to the package are picked up without a restart.
%autoreload 2

In [3]:
import eugene as eu

Global seed set to 13
Global seed set to 13
Global seed set to 13


# Benchmark params

In [4]:
# Benchmark settings. They are interpolated into the TensorBoard log
# directory name further down in the notebook.
BATCH_SIZE = 32    # batch size label for the run
NUM_SEQS = 1_000   # number of sequences label for the run
SEQ_LEN = 66       # sequence length label for the run
NUM_WORKERS = 0    # number of workers label for the run

In [5]:
# Keyword arguments for the three sub-networks; they are passed to the model
# constructor as conv_kwargs, rnn_kwargs and fc_kwargs respectively.
CNN_KWARGS = {
    "channels": [4, 16, 32],
    "conv_kernels": [15, 5],
    "pool_kernels": [1, 1],
}
RNN_KWARGS = {
    "output_dim": 32,
    "bidirectional": True,
    "batch_first": True,
}
FCN_KWARGS = {
    "hidden_dims": [50],
    "output_dim": 1,
}

In [6]:
# Run configuration. MODEL is used as the TensorBoard logger name further
# down; STRAND, TASK and LOSS_FXN are passed to the model constructor.
MODEL = "hybrid"
STRAND = "ss"
TASK = "regression"
# NOTE(review): a "poisson" loss paired with a "regression" task on targets
# that appear as 0.0/1.0 in the tables below — confirm this is intended.
LOSS_FXN = "poisson"

# Instantiate model

In [7]:
# Build the hybrid model from the configuration cells above.
# `input_len` reads SEQ_LEN instead of repeating the literal 66, so the model
# input length cannot drift away from the benchmark setting.
eugene = eu.models.Hybrid(
    input_len=SEQ_LEN,
    strand=STRAND,
    task=TASK,
    loss_fxn=LOSS_FXN,
    conv_kwargs=CNN_KWARGS,
    rnn_kwargs=RNN_KWARGS,
    fc_kwargs=FCN_KWARGS,
)
# Apply EUGENE's `init_weights` helper to the freshly constructed model.
eu.models.base.init_weights(eugene)
# Bare last expression: display the model repr as the cell output.
eugene

In [None]:
# Display the `loss_fxn` attribute of the model, to check what the
# `loss_fxn=LOSS_FXN` constructor argument resolved to.
eugene.loss_fxn

# Load data

In [None]:
# Load EUGENE's `random1000` dataset. The repr below reports 1000 sequences
# with a 'TARGETS' entry in seqs_annot.
sdata = eu.datasets.random1000()
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
    seqs_annot: 'TARGETS'

In [None]:
# Split the data; per the output below this modifies `sdata` by adding a
# TRAIN entry to seqs_annot.
# NOTE(review): presumably "split": 0.8 is the training fraction — confirm.
eu.pp.train_test_split_data(sdata, kwargs = {"split": 0.8})

SeqData object modified:
    seqs_annot:
        + TRAIN


# Train (and time) with PyTorch Lightning

In [None]:
from eugene.train import fit

In [20]:
# Train for 3 epochs through EUGENE's `fit` wrapper, logging to ../_logs and
# writing outputs to ../_out/.
# NOTE(review): num_workers=4 here disagrees with NUM_WORKERS = 0 defined in
# the benchmark params — confirm which value is intended.
# NOTE(review): the recorded run ends with a MisconfigurationException
# ("'PredictionWriter' is already present in the registry"); possibly caused
# by re-running after autoreload — confirm on a fresh kernel.
fit(eugene, sdata=sdata, epochs=3, num_workers=4, log_dir="../_logs", out_dir="../_out/")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 3.6 K 
1 | recurrentnet | BasicRecurrent            | 16.9 K
2 | fcnet        | BasicFullyConnectedModule | 3.3 K 
3 | r_squared    | R2Score                   | 0     
-----------------------------------------------------------
23.8 K    Trainable params
0         Non-trainable params
23.8 K    Total params
0.095     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 13


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

MisconfigurationException: 'PredictionWriter' is already present in the registry. HINT: Use `override=True`.

In [None]:
# Display the per-sequence annotation table; the recorded output has
# TARGETS, TRAIN and PREDICTIONS columns.
sdata.seqs_annot

Unnamed: 0,TARGETS,TRAIN,PREDICTIONS
seq001,1.0,True,0.555350
seq002,0.0,True,0.549995
seq003,1.0,True,0.541370
seq004,0.0,False,0.560739
seq005,0.0,False,0.526598
...,...,...,...
seq996,0.0,True,0.536871
seq997,0.0,True,0.548602
seq998,0.0,True,0.552571
seq999,0.0,True,0.554980


In [None]:
# Reload the training predictions from disk and check that they agree with
# the PREDICTIONS column held on `sdata` for the same sequence ids.
# NOTE(review): presumably this file is written by `fit(..., out_dir="../_out/")` — confirm.
saved_t = pd.read_csv("../_out/train_predictions.tsv", sep="\t", index_col=0)
saved_values = saved_t["PREDICTION"].values
annot_values = sdata.seqs_annot.loc[saved_t.index, "PREDICTIONS"].values
np.allclose(saved_values, annot_values)


True

In [58]:
# Join the annotation table with `preds` on the index of both frames.
# NOTE(review): `preds` is not defined in any visible cell — this relies on
# leftover kernel state and will fail on Restart & Run All.
sdata.seqs_annot.merge(preds, left_index=True, right_index=True)

Unnamed: 0,TARGETS,TRAIN,0
seq001,1.0,False,0.6329051
seq002,0.0,True,0.49575517
seq003,1.0,False,0.546878
seq004,0.0,False,0.4899232
seq005,0.0,False,0.6157094
...,...,...,...
seq996,0.0,True,0.48639736
seq997,0.0,True,0.6196456
seq998,0.0,False,0.5316826
seq999,0.0,True,0.35639912


In [39]:
# Stack the arrays in `v` along axis 0 and show them as a DataFrame.
# NOTE(review): `v` is not defined in any visible cell — this relies on
# leftover kernel state and will fail on Restart & Run All.
pd.DataFrame(np.concatenate(v, axis=0))

Unnamed: 0,0,1,2
0,seq001,0.6047737,1.0
1,seq003,0.5640073,0.0
2,seq004,0.5120066,1.0
3,seq005,0.5677963,0.0
4,seq008,0.58250546,0.0
...,...,...,...
595,seq991,0.48263985,1.0
596,seq992,0.61272997,0.0
597,seq994,0.55296373,1.0
598,seq998,0.5503518,1.0


In [None]:
# Same training entry point, called through the package namespace: 5 epochs,
# logging to ../_logs, no out_dir or num_workers passed.
eu.train.fit(eugene, sdata=sdata, epochs=5, log_dir="../_logs") 

In [22]:
from pytorch_lightning import Trainer

In [23]:
# Bare PyTorch Lightning Trainer limited to 3 epochs, used to fit the model
# directly instead of going through EUGENE's `fit` wrapper.
trainer = Trainer(max_epochs=3)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [24]:
# Fit with the Trainer directly. Only one dataloader is passed, and the
# recorded warning confirms there is no validation dataloader, so the
# validation loop is skipped.
# NOTE(review): `sdataloader` is not defined in any visible cell — this
# relies on leftover kernel state and will fail on Restart & Run All.
trainer.fit(eugene, sdataloader) 

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 3.6 K 
1 | recurrentnet | BasicRecurrent            | 16.9 K
2 | fcnet        | BasicFullyConnectedModule | 3.3 K 
3 | r_squared    | R2Score                   | 0     
-----------------------------------------------------------
23.8 K    Trainable params
0         Non-trainable params
23.8 K    Total params
0.095     Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

In [19]:
# Show the first parameter tensor yielded by the model, as a quick look at
# the current weights.
next(eugene.parameters())

Parameter containing:
tensor([[[-9.4785e-02, -1.7045e-03,  1.2167e-01, -1.2430e-01, -1.2435e-01,
           2.1259e-02, -7.5152e-02,  7.2229e-02,  1.0589e-01, -8.6553e-02,
          -1.1654e-01, -1.2605e-02,  1.0931e-01,  1.7115e-02, -8.7077e-03],
         [-7.3724e-02, -7.8527e-02, -5.5711e-02,  8.2650e-02, -1.0370e-01,
           1.4170e-01, -1.6458e-02,  9.5570e-03,  9.7584e-02,  1.2892e-01,
          -4.4364e-02, -1.1682e-02,  8.7450e-02, -1.9598e-02, -6.5818e-02],
         [-8.2107e-02, -3.5073e-02,  6.3604e-02,  8.6646e-02,  1.4163e-01,
          -4.8572e-02, -3.7498e-02, -7.4766e-02,  2.6753e-02, -9.3077e-02,
           1.2472e-01, -1.4999e-01,  7.3813e-02,  9.0117e-03,  7.4403e-02],
         [ 7.2231e-02,  1.1413e-01, -8.5916e-02, -7.6079e-02,  8.6604e-02,
           3.2138e-02,  6.6427e-02, -4.0387e-02,  1.5578e-01,  6.1688e-02,
           2.6640e-02, -5.1305e-02,  2.5741e-02, -1.4536e-01,  5.5284e-02]],

        [[-9.9758e-03,  2.3328e-02, -4.0814e-03, -3.5643e-03, -1.7920e-0

In [60]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [63]:
# Encode the benchmark parameters in the log directory name so runs with
# different settings land in different folders.
LOG_DIR = f"../_logs/batch_size-{BATCH_SIZE}.num_workers-{NUM_WORKERS}.num_seq-{NUM_SEQS}.seq_len-{SEQ_LEN}"
# Logger name is the model type; version is "<strand>_<task>".
logger = TensorBoardLogger(LOG_DIR, name=MODEL, version=f"{STRAND}_{TASK}")
# Replaces the earlier `trainer`: 10 epochs, logging through TensorBoard.
trainer = pl.Trainer(max_epochs=10, logger=logger)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [64]:
# Fit again with the TensorBoard-logging trainer defined in the previous cell.
# NOTE(review): `sdataloader` is not defined in any visible cell — this
# relies on leftover kernel state and will fail on Restart & Run All.
trainer.fit(eugene, sdataloader)


  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 3.6 K 
1 | recurrentnet | BasicRecurrent            | 16.9 K
2 | fcnet        | BasicFullyConnectedModule | 3.3 K 
3 | r_squared    | R2Score                   | 0     
-----------------------------------------------------------
23.8 K    Trainable params
0         Non-trainable params
23.8 K    Total params
0.095     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

---

# Scratch