# Ray et al. 2013 Multi-Task Training
**Authorship:**
Adam Klie, *08/31/2022*
***
**Description:**
Notebook to perform simple training of *multi-task* models on the Ray et al. (2013) dataset.
***

In [1]:
# Load the autoreload extension only when it is not already registered as loaded
# in this IPython session (avoids re-loading it when the cell is re-run).
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: per IPython's autoreload documentation, reload all modules before executing code.
%autoreload 2

import os
import logging
import torch
import numpy as np
import pandas as pd
import eugene as eu

Global seed set to 13
2022-09-03 17:38:02.838523: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-03 17:38:02.838581: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)


In [2]:
# Point eugene at the ray13 dataset, output, logging and config directories.
# All four paths are relative, so they resolve against the notebook's working directory.
eu.settings.dataset_dir = "../../../_datasets/ray13"
eu.settings.output_dir = "../../../_output/ray13"
eu.settings.logging_dir = "../../../_logs/ray13"
eu.settings.config_dir = "../../../_configs/ray13"
# Set eugene's verbosity to the ERROR logging level.
eu.settings.verbosity = logging.ERROR

# Data-loading settings; the values below are meant to be tuned experimentally.
# NOTE(review): the eu.train.fit call later in this notebook passes batch_size=64 and
# num_workers=4 explicitly, and dl_num_workers is reassigned to 0 before predicting,
# so confirm which of these values are actually in effect.
eu.settings.dl_num_workers = 4
eu.settings.batch_size = 1024
eu.settings.dl_pin_memory_gpu_training = True

In [3]:
# Load the multi-task training SeqData file from the configured dataset directory.
# The directory is joined exactly once: joining the relative dataset_dir twice would
# produce "<dataset_dir>/<dataset_dir>/<file>" rather than the file inside dataset_dir.
sdata_training = eu.dl.read_h5sd(os.path.join(eu.settings.dataset_dir, "norm_setA_MT_sub.h5sd"))

In [5]:
# Grab the target columns: annotation columns whose name contains "RNCMPT".
# Columns ending in "_predictions" are excluded because the prediction step later in
# this notebook adds "RNCMPT*_predictions" columns to seqs_annot; without this guard,
# re-running this cell after predicting would select those columns as targets too.
target_mask = (
    sdata_training.seqs_annot.columns.str.contains("RNCMPT")
    & ~sdata_training.seqs_annot.columns.str.endswith("_predictions")
)
target_cols = sdata_training.seqs_annot.columns[target_mask]

In [8]:
# Version counter for this training run; it is interpolated into the "v{model_version}"
# run version passed to fit/predict and into the predictions output filename.
model_version = 0

In [12]:
# Regularization settings used by the convolutional and fully connected blocks
conv_dropout, fc_dropout = 0.5, 0.5
batchnorm = True

# DeepBind regression model with one output per target column
model = eu.models.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols),  # Number of multitask outputs
    strand="ss",
    task="regression",
    optimizer="sgd",
    lr=0.0005,
    scheduler_patience=3,
    conv_kwargs={
        "channels": [4, 512],
        "conv_kernels": [16],
        "dropout_rates": conv_dropout,
        "batchnorm": batchnorm,
    },
    mp_kwargs={"kernel_size": 8},
    fc_kwargs={
        "hidden_dims": [1024],
        "dropout_rate": fc_dropout,
        "batchnorm": batchnorm,
    },
)

# Print the hyperparameter summary, then show the module structure as the cell output
model.summary()
model

Model: DeepBind
Input length: 41
Output dimension: 233
Strand: ss
Task: regression
Aggregation: None
Loss function: mse_loss
Optimizer: sgd
	Optimizer parameters: {}
Learning rate: 0.0005
Scheduler: lr_scheduler
Scheduler patience: 3


DeepBind(
  (hp_metric): R2Score()
  (max_pool): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 512, kernel_size=(16,), stride=(1,))
      (1): ReLU()
      (2): Dropout(p=0.5, inplace=False)
      (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (fcn): BasicFullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=1664, out_features=1024, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.5, inplace=False)
      (3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (4): Linear(in_features=1024, out_features=233, bias=True)
    )
  )
)

In [13]:
# Fit the multi-task model on the sequences selected by the "train_val" key
eu.train.fit(
    # what to train, and on which data
    model=model,
    sdata=sdata_training,
    target=target_cols,
    train_key="train_val",
    # run identification
    name="DeepBind_MT",
    version=f"v{model_version}",
    seed=0,
    # training loop and data loading
    epochs=5,
    batch_size=64,
    num_workers=4,
    early_stopping_metric="val_loss",
    early_stopping_patience=5,
    verbosity=logging.ERROR,
    #gpus=1,
)

# Generate predictions for the same data with the fitted model.
# NOTE: this assignment changes the global eugene setting and nothing in this cell
# restores it, so dl_num_workers stays 0 for the rest of the session.
eu.settings.dl_num_workers = 0
eu.predict.train_val_predictions(
    model,
    sdata=sdata_training,
    target=target_cols,
    train_key="train_val",
    name="DeepBind_MT",
    version=f"v{model_version}",
)

Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 34.3 K
3 | fcn       | BasicFullyConnectedModule | 1.9 M 
--------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.921     Total estimated model params size (MB)


Dropping 0 sequences with NaN targets.
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.062


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

  self[k1] = value[k2]


SeqData object modified:
    seqs_annot:
        + RNCMPT00234_predictions, RNCMPT00054_predictions, RNCMPT00285_predictions, RNCMPT00124_predictions, RNCMPT00224_predictions, RNCMPT00209_predictions, RNCMPT00037_predictions, RNCMPT00187_predictions, RNCMPT00284_predictions, RNCMPT00156_predictions, RNCMPT00126_predictions, RNCMPT00259_predictions, RNCMPT00088_predictions, RNCMPT00001_predictions, RNCMPT00216_predictions, RNCMPT00073_predictions, RNCMPT00050_predictions, RNCMPT00199_predictions, RNCMPT00028_predictions, RNCMPT00131_predictions, RNCMPT00217_predictions, RNCMPT00177_predictions, RNCMPT00225_predictions, RNCMPT00081_predictions, RNCMPT00253_predictions, RNCMPT00185_predictions, RNCMPT00257_predictions, RNCMPT00134_predictions, RNCMPT00159_predictions, RNCMPT00165_predictions, RNCMPT00120_predictions, RNCMPT00077_predictions, RNCMPT00155_predictions, RNCMPT00140_predictions, RNCMPT00279_predictions, RNCMPT00235_predictions, RNCMPT00160_predictions, RNCMPT00085_predictions,

In [18]:
# Save the SeqData object (now carrying the prediction columns) under the output
# directory, with the model version embedded in the filename.
sdata_training.write_h5sd(
    os.path.join(
        eu.settings.output_dir,
        f"norm_training_predictions_MT_v{model_version}.h5sd",
    )
)

: 

In [None]:
# Advance the version counter for the next training run.
# Not idempotent: every execution of this cell increments the counter again.
model_version += 1

---

# Scratch