# Testing EUGENE `train` module

**Authorship:**
Adam Klie, *03/19/2022*
***
**Description:**
Notebook for testing the EUGENE `train` module. 

# Set-up

In [20]:
import numpy as np
import pandas as pd

# Autoreload extension: load it only if this kernel has not loaded it yet,
# so re-running the cell does not issue a second %load_ext.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2 asks IPython to reload all imported modules before running each cell,
# so edits to the package under test are picked up without a kernel restart.
%autoreload 2

: 

: 

In [2]:
import eugene as eu

Global seed set to 13
2022-09-10 22:10:33.655385: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-10 22:10:36.579326: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-10 22:10:36.579362: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-10 22:10:36.839644: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-10 22:10:40.019932: W tensorfl

In [16]:
# Set EUGENE's global settings for this session.
# NOTE(review): presumably eu.train.fit / eu.predict fall back to these values
# when no explicit argument is passed — confirm against the eugene settings docs.
eu.settings.batch_size = 128
eu.settings.dl_num_workers = 4
# NOTE(review): these paths are "../../_logs" and "../../_outputs", but later cells in this
# notebook use "../_logs" and "../_out" — confirm which directory layout is intended.
eu.settings.logging_dir = "../../_logs"
eu.settings.output_dir = "../../_outputs"

# Configure model

In [4]:
# Manual configuration of the Hybrid (CNN -> RNN -> FCN) architecture.
# These constants are consumed by the eu.models.Hybrid(...) call further down.
SEQ_LEN = 66  # passed as input_len
OUT_DIM = 10  # passed as output_dim; also the number of LABEL_* targets

# Keyword arguments forwarded to the convolutional block.
CNN_KWARGS = {
    "channels": [4, 16, 32],
    "conv_kernels": [15, 5],
    "pool_kernels": [1, 1],
}

# Keyword arguments forwarded to the recurrent block.
RNN_KWARGS = {
    "output_dim": 32,
    "bidirectional": True,
    "batch_first": True,
}

# Keyword arguments forwarded to the fully connected head.
FCN_KWARGS = {"hidden_dims": [50]}

# Strand mode, task type and loss identifier, passed through as strings.
STRAND, TASK, LOSS_FXN = "ss", "regression", "mse"

# Instantiate model

In [5]:
# Build the Hybrid model from the configuration constants defined above.
# The printed module tree further down shows the resulting layout:
# two Conv1d layers -> bidirectional LSTM -> two Linear layers ending in 10 outputs.
eugene = eu.models.Hybrid(
    input_len=SEQ_LEN,
    output_dim=OUT_DIM,
    strand=STRAND,
    task=TASK,
    loss_fxn=LOSS_FXN,
    conv_kwargs=CNN_KWARGS,
    rnn_kwargs=RNN_KWARGS,
    fc_kwargs=FCN_KWARGS
)

In [7]:
# Apply EUGENE's weight-initialization helper to the model (the scheme itself is
# defined inside eugene and is not visible here), then show the module tree.
eu.models.init_weights(eugene)
eugene

Hybrid(
  (hp_metric): R2Score()
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(15,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
      (3): Conv1d(16, 32, kernel_size=(5,), stride=(1,))
      (4): ReLU()
      (5): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (recurrentnet): BasicRecurrent(
    (module): LSTM(32, 32, batch_first=True, bidirectional=True)
  )
  (fcnet): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=64, out_features=50, bias=True)
      (1): ReLU()
      (2): Linear(in_features=50, out_features=10, bias=True)
    )
  )
)

# Load data

In [8]:
# Load the bundled random1000_10 dataset as a SeqData object. Per the repr below it
# holds 1000 sequences with LABEL_0..9 and ACTIVITY_0..9 annotation columns.
sdata = eu.datasets.random1000_10()
sdata

SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = None
ohe_rev_seqs = None
seqs_annot: 'LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9', 'ACTIVITY_0', 'ACTIVITY_1', 'ACTIVITY_2', 'ACTIVITY_3', 'ACTIVITY_4', 'ACTIVITY_5', 'ACTIVITY_6', 'ACTIVITY_7', 'ACTIVITY_8', 'ACTIVITY_9'
pos_annot: None
seqsm: None
uns: None

In [9]:
# Preprocess sdata in place (the return value is not captured). Per the output below,
# this adds ohe_seqs / ohe_rev_seqs of shape (1000, 4, 66) and a `train_val` column
# to seqs_annot.
eu.pp.prepare_seqs_sdata(sdata)
sdata

  0%|          | 0/3 [00:00<?, ?it/s]

One-hot encoding sequences:   0%|          | 0/1000 [00:00<?, ?it/s]

SeqData object modified:
	ohe_seqs: None -> 1000 ohe_seqs added
	ohe_rev_seqs: None -> 1000 ohe_rev_seqs added
    seqs_annot:
        + train_val


SeqData object with = 1000 seqs
seqs = (1000,)
names = (1000,)
rev_seqs = None
ohe_seqs = (1000, 4, 66)
ohe_rev_seqs = (1000, 4, 66)
seqs_annot: 'LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9', 'ACTIVITY_0', 'ACTIVITY_1', 'ACTIVITY_2', 'ACTIVITY_3', 'ACTIVITY_4', 'ACTIVITY_5', 'ACTIVITY_6', 'ACTIVITY_7', 'ACTIVITY_8', 'ACTIVITY_9', 'train_val'
pos_annot: None
seqsm: None
uns: None

# Train 

In [17]:
# One target column per model output: LABEL_0 .. LABEL_{OUT_DIM - 1}.
# These names match the LABEL_* columns listed in sdata's seqs_annot.
targets = [f"LABEL_{i}" for i in range(OUT_DIM)]
targets

['LABEL_0',
 'LABEL_1',
 'LABEL_2',
 'LABEL_3',
 'LABEL_4',
 'LABEL_5',
 'LABEL_6',
 'LABEL_7',
 'LABEL_8',
 'LABEL_9']

In [19]:
# Fit the model for a single epoch on the LABEL_* targets.
# No log/output directory is passed here; presumably the eu.settings values are used — confirm.
eu.train.fit(eugene, sdata=sdata, target_keys=targets, epochs=1)

Global seed set to 13
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | hp_metric    | R2Score                   | 0     
1 | convnet      | BasicConv1D               | 3.6 K 
2 | recurrentnet | BasicRecurrent            | 16.9 K
3 | fcnet        | BasicFullyConnectedModule | 3.8 K 
-----------------------------------------------------------
24.2 K    Trainable params
0         Non-trainable params
24.2 K    Total params
0.097     Total estimated model params size (MB)


Dropping 0 sequences with NaN targets.
No transforms given, assuming just need to tensorize.
No transforms given, assuming just need to tensorize.


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 13


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.370


In [36]:
# Presumably predicts on the train and validation splits and writes results under out_dir — confirm.
# NOTE(review): OUT_DIR is not defined in any cell visible in this notebook, so this raises
# NameError on a fresh kernel; define it in the settings cell (or reuse eu.settings.output_dir).
# NOTE(review): gpus=1 is hard-coded, while the eu.train.fit cell above logged
# "GPU available: False"; the output below is from an earlier session on a GPU machine.
# NOTE(review): this call passes target_label=, whereas eu.train.fit above takes target_keys= —
# confirm the keyword against the current eugene API.
eu.predict.train_val_predictions(eugene, sdata=sdata, target_label=targets, gpus=1, out_dir=OUT_DIR)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HERE ['LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9']
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

                 0           1           2           3            4  \
seq000  0.22151977  0.42979366  0.40367535  0.62748384   0.10740046   
seq001  0.20679933   0.3980499  0.38247472   0.5965491   0.10531287   
seq002  0.20890895  0.40850383  0.39109713   0.6113459  0.106457405   
seq005  0.21345711  0.41622812  0.40266123  0.62153447  0.107835375   
seq006  0.20972896  0.41624838  0.39150807    0.612308   0.11024589   
...            ...         ...         ...         ...          ...   
seq980   0.2183375   0.4252531   0.4016859   0.6296614   0.10766679   
seq986  0.22487453  0.42974097  0.40623972  0.63793683   0.10833167   
seq987  0.22622463  0.42148906  0.39546132   0.6181561  0.103938796   
seq991  0.21146774   0.4153229  0.38607493   0.6068795  0.112112425   
seq993  0.21846488   0.4268222  0.39783695   0.6232819   0.10530958   

                 5           6           7           8           9  
seq000  0.16664916  0.31013647  0.59695965  0.23060434  0.40394092  
seq001  0

In [None]:
# Read back the train-split predictions written to disk and check that they agree
# (within np.allclose tolerance) with the PREDICTIONS column kept on sdata for the
# same sequence ids. The boolean result is the cell's displayed output.
saved_t = pd.read_csv(
    "../_out/train_predictions.tsv",
    sep="\t",
    index_col=0,
)
np.allclose(
    saved_t["PREDICTION"].values,
    sdata.seqs_annot.loc[saved_t.index, "PREDICTIONS"].values,
)

True

In [58]:
# Join per-sequence predictions onto the annotation table by index (inner join by default).
# NOTE(review): `preds` is not defined in any visible cell, and the output below shows
# TARGETS / TRAIN columns that the current sdata (LABEL_*, train_val) does not have —
# this cell looks stale and will fail on a fresh kernel.
sdata.seqs_annot.merge(preds, left_index=True, right_index=True)

Unnamed: 0,TARGETS,TRAIN,0
seq001,1.0,False,0.6329051
seq002,0.0,True,0.49575517
seq003,1.0,False,0.546878
seq004,0.0,False,0.4899232
seq005,0.0,False,0.6157094
...,...,...,...
seq996,0.0,True,0.48639736
seq997,0.0,True,0.6196456
seq998,0.0,False,0.5316826
seq999,0.0,True,0.35639912


In [39]:
# Stack the arrays in `v` row-wise and display them as a DataFrame.
# NOTE(review): `v` is not defined in any visible cell (leftover kernel state);
# this raises NameError under Restart & Run All.
pd.DataFrame(np.concatenate(v, axis=0))

Unnamed: 0,0,1,2
0,seq001,0.6047737,1.0
1,seq003,0.5640073,0.0
2,seq004,0.5120066,1.0
3,seq005,0.5677963,0.0
4,seq008,0.58250546,0.0
...,...,...,...
595,seq991,0.48263985,1.0
596,seq992,0.61272997,0.0
597,seq994,0.55296373,1.0
598,seq998,0.5503518,1.0


In [None]:
# Longer fit (5 epochs) with an explicit log directory; this cell was never executed (In [None]).
# NOTE(review): unlike the earlier eu.train.fit call, no target_keys are passed here, and
# log_dir="../_logs" differs from eu.settings.logging_dir ("../../_logs") — confirm both.
eu.train.fit(eugene, sdata=sdata, epochs=5, log_dir="../_logs") 

In [22]:
from pytorch_lightning import Trainer

In [23]:
# Plain PyTorch Lightning trainer capped at 3 epochs, used to drive the model
# directly instead of going through eu.train.fit.
trainer = Trainer(max_epochs=3)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [24]:
# Fit with a training dataloader only; the warning below confirms no val dataloader is supplied.
# NOTE(review): `sdataloader` is not defined in any visible cell — it must be built
# from sdata before this cell can run on a fresh kernel.
trainer.fit(eugene, sdataloader) 

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 3.6 K 
1 | recurrentnet | BasicRecurrent            | 16.9 K
2 | fcnet        | BasicFullyConnectedModule | 3.3 K 
3 | r_squared    | R2Score                   | 0     
-----------------------------------------------------------
23.8 K    Trainable params
0         Non-trainable params
23.8 K    Total params
0.095     Total estimated model params size (MB)
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

In [19]:
# Display the first parameter tensor yielded by the model, as a spot check of the weights.
# Presumably this is the first Conv1d weight (rows of 15 values in groups of 4 below) — confirm.
next(eugene.parameters())

Parameter containing:
tensor([[[-9.4785e-02, -1.7045e-03,  1.2167e-01, -1.2430e-01, -1.2435e-01,
           2.1259e-02, -7.5152e-02,  7.2229e-02,  1.0589e-01, -8.6553e-02,
          -1.1654e-01, -1.2605e-02,  1.0931e-01,  1.7115e-02, -8.7077e-03],
         [-7.3724e-02, -7.8527e-02, -5.5711e-02,  8.2650e-02, -1.0370e-01,
           1.4170e-01, -1.6458e-02,  9.5570e-03,  9.7584e-02,  1.2892e-01,
          -4.4364e-02, -1.1682e-02,  8.7450e-02, -1.9598e-02, -6.5818e-02],
         [-8.2107e-02, -3.5073e-02,  6.3604e-02,  8.6646e-02,  1.4163e-01,
          -4.8572e-02, -3.7498e-02, -7.4766e-02,  2.6753e-02, -9.3077e-02,
           1.2472e-01, -1.4999e-01,  7.3813e-02,  9.0117e-03,  7.4403e-02],
         [ 7.2231e-02,  1.1413e-01, -8.5916e-02, -7.6079e-02,  8.6604e-02,
           3.2138e-02,  6.6427e-02, -4.0387e-02,  1.5578e-01,  6.1688e-02,
           2.6640e-02, -5.1305e-02,  2.5741e-02, -1.4536e-01,  5.5284e-02]],

        [[-9.9758e-03,  2.3328e-02, -4.0814e-03, -3.5643e-03, -1.7920e-0

In [60]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [63]:
# Build a run-specific TensorBoard log directory and a 10-epoch Lightning trainer that logs to it.
# NOTE(review): BATCH_SIZE, NUM_WORKERS, NUM_SEQS and MODEL are not defined in any visible
# cell (only SEQ_LEN, STRAND and TASK are), so this raises NameError on a fresh kernel.
LOG_DIR = f"../_logs/batch_size-{BATCH_SIZE}.num_workers-{NUM_WORKERS}.num_seq-{NUM_SEQS}.seq_len-{SEQ_LEN}"
# Runs are grouped under <LOG_DIR>/<MODEL>/<STRAND>_<TASK>.
logger = TensorBoardLogger(LOG_DIR, name=MODEL, version=f"{STRAND}_{TASK}")
trainer = pl.Trainer(max_epochs=10, logger=logger)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [64]:
# Fit using the TensorBoard-logging trainer created in the previous cell.
# NOTE(review): `sdataloader` is not defined in any visible cell — confirm where it is built.
trainer.fit(eugene, sdataloader)


  | Name         | Type                      | Params
-----------------------------------------------------------
0 | convnet      | BasicConv1D               | 3.6 K 
1 | recurrentnet | BasicRecurrent            | 16.9 K
2 | fcnet        | BasicFullyConnectedModule | 3.3 K 
3 | r_squared    | R2Score                   | 0     
-----------------------------------------------------------
23.8 K    Trainable params
0         Non-trainable params
23.8 K    Total params
0.095     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

---

# Scratch