# Ray et al 2013 Training 
**Authorship:**
Adam Klie, *08/31/2022*
***
**Description:**
Notebook for training *single-task* and *multitask* DeepBind models on the Ray et al. (2013) dataset.
***

In [1]:
# Load the IPython autoreload extension only if it is not already loaded,
# then enable mode 2 so edited modules are reloaded before each cell executes.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import os
import logging
import torch
import numpy as np
import pandas as pd
import eugene as eu

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: Quadro RTX 5000


  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)


Already up to date.


In [2]:
# Configure where EUGENe reads the ray13 data from and where outputs, logs and configs live.
# NOTE(review): these are absolute, user-specific paths; they must be adapted before the
# notebook can be run on another machine.
eu.settings.dataset_dir = "/cellar/users/aklie/data/eugene/ray13"
eu.settings.output_dir = "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/output/ray13"
eu.settings.logging_dir = "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/logs/ray13"
eu.settings.config_dir = "/cellar/users/aklie/projects/EUGENe/EUGENe_paper/configs/ray13"
# Restrict EUGENe's own verbosity setting to the ERROR level
eu.settings.verbosity = logging.ERROR

# Load in the SetA training `SeqData` objects for single-task and multitask models

In [3]:
# Load in the training SetA processed data for single task and multitask models.
# The file name is joined onto the dataset directory exactly once: passing the directory
# twice only worked because it is absolute (os.path.join discards everything before an
# absolute component) and would produce a doubled path for a relative `dataset_dir`.
sdata_training_ST = eu.dl.read_h5sd(os.path.join(eu.settings.dataset_dir, "norm_setA_processed_ST.h5sd"))
sdata_training_MT = eu.dl.read_h5sd(os.path.join(eu.settings.dataset_dir, "norm_setA_processed_MT.h5sd"))

In [4]:
# Pick out the target columns (annotation columns whose name contains "RNCMPT")
# for the single task dataset ...
annot_columns_ST = sdata_training_ST.seqs_annot.columns
target_mask_ST = annot_columns_ST.str.contains("RNCMPT")
target_cols_ST = annot_columns_ST[target_mask_ST]

# ... and for the multitask dataset
annot_columns_MT = sdata_training_MT.seqs_annot.columns
target_mask_MT = annot_columns_MT.str.contains("RNCMPT")
target_cols_MT = annot_columns_MT[target_mask_MT]

# Train single task models

In [6]:
# Factory for freshly initialised single task DeepBind models
from pytorch_lightning import seed_everything
def prep_new_model(
    seed,
    conv_dropout = 0,
    fc_dropout = 0,
    batchnorm = True
):
    """Create a single-output DeepBind regression model and initialise it under `seed`.

    Parameters
    ----------
    seed
        Value handed to `seed_everything` right before `init_weights` is applied.
    conv_dropout
        Forwarded as `dropout_rates` in the convolutional block's kwargs.
    fc_dropout
        Forwarded as `dropout_rate` in the fully connected block's kwargs.
    batchnorm
        Forwarded as `batchnorm` to both the convolutional and fully connected blocks.

    Returns
    -------
    The `eu.models.DeepBind` instance after `eu.models.base.init_weights` has been
    called on it.
    """
    conv_settings = {
        "channels": [4, 16],
        "conv_kernels": [16],
        "dropout_rates": conv_dropout,
        "batchnorm": batchnorm,
    }
    pool_settings = {"kernel_size": 8}
    fc_settings = {
        "hidden_dims": [32],
        "dropout_rate": fc_dropout,
        "batchnorm": batchnorm,
    }

    net = eu.models.DeepBind(
        input_len=41,   # length of the padded input sequences
        output_dim=1,   # one output: each model predicts a single target
        strand="ss",
        task="regression",
        conv_kwargs=conv_settings,
        mp_kwargs=pool_settings,
        fc_kwargs=fc_settings,
        optimizer="sgd",
        lr=0.0005,
        scheduler_patience=3,
    )

    # The seed is fixed after construction and immediately before the weights are
    # (re-)initialised, so the initialisation below is what the seed controls.
    seed_everything(seed)
    eu.models.base.init_weights(net)

    return net

In [7]:
# Smoke test: build a model and push one small batch through it before training
model = prep_new_model(0)
print(model.summary())

# Turn the first 64 sequences into a dataloader and pull a single batch from it
sdataset = sdata_training_ST[:64].to_dataset(transform_kwargs={"transpose": True})
sdataloader = sdataset.to_dataloader()
test_seqs = next(iter(sdataloader))

# Items 1 and 2 of the batch are the two inputs the model's forward pass is called with
test_output = model(test_seqs[1], test_seqs[2])
print(test_output.size())

Global seed set to 0


Model: DeepBind
Input length: 41
Output dimension: 1
Strand: ss
Task: regression
Aggregation: None
Loss function: mse_loss
Optimizer: sgd
	Optimizer parameters: {}
Learning rate: 0.0005
Scheduler: lr_scheduler
Scheduler patience: 3
  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 1.1 K 
3 | fcn       | BasicFullyConnectedModule | 1.8 K 
--------------------------------------------------------
2.9 K     Trainable params
0         Non-trainable params
2.9 K     Total params
0.011     Total estimated model params size (MB)
No transforms given, assuming just need to tensorize).
torch.Size([64, 1])


In [None]:
# Train a model on each target prediction!
# One DeepBind model is trained per single task target column. The loop index `i` is used
# as the seed both for model initialisation (prep_new_model) and for eu.train.fit, and the
# target column name is used as the run version.
for i, target_col in enumerate(target_cols_ST):
    print(f"Training DeepBind SingleTask model on {target_col}")

    # Initialize the model
    model = prep_new_model(seed=i, conv_dropout=0.5, fc_dropout=0.5, batchnorm=True)

    # Train the model
    eu.train.fit(
        model=model, 
        sdata=sdata_training_ST, 
        gpus=1, 
        target=target_col,
        train_key="train_val",
        epochs=10,
        early_stopping_metric="val_loss",
        early_stopping_patience=3,
        batch_size=64,
        num_workers=4,
        name="DeepBind_ST",
        seed=i,
        version=target_col,
        verbosity=logging.ERROR
    )
    
    # Get predictions on the training data
    # The global dataloader worker setting is (re-)assigned to 0 on every iteration,
    # after fitting and before predicting.
    eu.settings.dl_num_workers = 0
    eu.predict.train_val_predictions(
        model,
        sdata=sdata_training_ST, 
        target=target_col,
        train_key="train_val",
        name="DeepBind_ST",
        suffix="_ST",
        version=target_col
    )
    # Drop the reference to this target's model before moving on to the next one
    del model 
# Written once, after the loop has finished for every target: an interrupted run
# does not reach this line.
sdata_training_ST.write_h5sd(os.path.join(eu.settings.output_dir, "DeepBind_ST", "norm_training_predictions_ST.h5sd"))

Global seed set to 0
Global seed set to 0


Training DeepBind SingleTask model on RNCMPT00001
Dropping 610 sequences with NaN targets.
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 1.1 K 
3 | fcn       | BasicFullyConnectedModule | 1.8 K 
--------------------------------------------------------
2.9 K     Trainable params
0         Non-trainable params
2.9 K     Total params
0.011     Total estimated model params size (MB)


No transforms given, assuming just need to tensorize).


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.919


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.915


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.908


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.902


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.896


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.886


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.880


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.870
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


No transforms given, assuming just need to tensorize).


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Global seed set to 1
Global seed set to 1


SeqData object modified:
    seqs_annot:
        + RNCMPT00001_predictions_ST
Training DeepBind SingleTask model on RNCMPT00002
Dropping 609 sequences with NaN targets.
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 1.1 K 
3 | fcn       | BasicFullyConnectedModule | 1.8 K 
--------------------------------------------------------
2.9 K     Trainable params
0         Non-trainable params
2.9 K     Total params
0.011     Total estimated model params size (MB)


No transforms given, assuming just need to tensorize).


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 1


Training: 0it [00:00, ?it/s]

# Train multi-task model

In [5]:
# Define the version for saving: it is used as the run version ("v<model_version>") when
# fitting/predicting the multitask model and in the name of the saved predictions file.
model_version = 2

In [11]:
# Build the multitask DeepBind model: one output per multitask target column
conv_dropout = 0.25
fc_dropout = 0.25
batchnorm = True

model = eu.models.DeepBind(
    input_len=41,                    # length of the padded input sequences
    output_dim=len(target_cols_MT),  # number of multitask outputs
    strand="ss",
    task="regression",
    conv_kwargs={
        "channels": [4, 1024],
        "conv_kernels": [16],
        "dropout_rates": conv_dropout,
        "batchnorm": batchnorm,
    },
    fc_kwargs={
        "hidden_dims": [512],
        "dropout_rate": fc_dropout,
        "batchnorm": batchnorm,
    },
    optimizer="sgd",
    optimizer_kwargs=dict(nesterov=True, momentum=0.9, weight_decay=1e-5),
    lr=0.0005,
    scheduler_patience=2,
)

# Show the model summary together with the version it will be saved under
model.summary(), model_version

Model: DeepBind
Input length: 41
Output dimension: 233
Strand: ss
Task: regression
Aggregation: max
Loss function: mse_loss
Optimizer: sgd
	Optimizer parameters: {'nesterov': True, 'momentum': 0.9, 'weight_decay': 1e-05}
Learning rate: 0.0005
Scheduler: lr_scheduler
Scheduler patience: 2


(  | Name      | Type                      | Params
 --------------------------------------------------------
 0 | hp_metric | R2Score                   | 0     
 1 | convnet   | BasicConv1D               | 68.6 K
 2 | max_pool  | MaxPool1d                 | 0     
 3 | avg_pool  | AvgPool1d                 | 0     
 4 | fcn       | BasicFullyConnectedModule | 1.2 M 
 --------------------------------------------------------
 1.2 M     Trainable params
 0         Non-trainable params
 1.2 M     Total params
 4.953     Total estimated model params size (MB),
 2)

In [None]:
# Fit the multitask model, then add its train/validation predictions to the SeqData
mt_run_name = "DeepBind_MT"
mt_run_version = f"v{model_version}"

eu.train.fit(
    model=model,
    sdata=sdata_training_MT,
    target=target_cols_MT,
    train_key="train_val",
    name=mt_run_name,
    version=mt_run_version,
    seed=42,
    gpus=1,
    epochs=200,
    batch_size=1024,
    num_workers=0,
    early_stopping_metric="val_loss",
    early_stopping_patience=5,
    verbosity=logging.ERROR,
)

# Set the global dataloader worker setting to 0 before predicting on the training data
eu.settings.dl_num_workers = 0
eu.predict.train_val_predictions(
    model,
    sdata=sdata_training_MT,
    target=target_cols_MT,
    train_key="train_val",
    name=mt_run_name,
    suffix="_MT",
    version=mt_run_version,
)

Global seed set to 42


Dropping 0 sequences with NaN targets.
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | convnet   | BasicConv1D               | 68.6 K
2 | max_pool  | MaxPool1d                 | 0     
3 | avg_pool  | AvgPool1d                 | 0     
4 | fcn       | BasicFullyConnectedModule | 1.2 M 
--------------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.953     Total estimated model params size (MB)


No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 42
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.866


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.865


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.865


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.864


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.864


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.863


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.863


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.862


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.862


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.861


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.860


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.860


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.860


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.859


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.859


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.858


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.858


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.858


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.857


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.857


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.856


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.855


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.855


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.854


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.854


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.854


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.853


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.853


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.852


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.852


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.852


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.851


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.850


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.850


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.850


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.849


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.849


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.848


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.848


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.848


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.847


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.847


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.846


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.846


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.846


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.845


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.845


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.845


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.844


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.844


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.844


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.843


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.843


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.842


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.842


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.837


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.837


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.837


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.836


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.836


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.835


In [14]:
# Persist the multitask SeqData under a file name tagged with the current model version
mt_predictions_file = f"norm_training_predictions_v{model_version}_MT.h5sd"
mt_predictions_path = os.path.join(eu.settings.output_dir, "DeepBind_MT", mt_predictions_file)
sdata_training_MT.write_h5sd(mt_predictions_path)

In [15]:
# Sanity check: count the annotation columns whose name contains "RNCMPT".
# Presumably this should be the targets plus one prediction column per target;
# compare against the number of multitask targets to confirm.
sdata_training_MT.seqs_annot.columns.str.contains("RNCMPT").sum()

466

In [16]:
# Advance the version counter when training several multitask models in a row.
# Re-running this cell bumps the version again.
model_version += 1

---

# Scratch

In [None]:
# Test conv kernel initialization, this needs a fix!
# NOTE(review): `prep_new_model` as defined earlier in this notebook only accepts
# (seed, conv_dropout, fc_dropout, batchnorm); it has no `arch` or `config` parameters,
# so these calls raise a TypeError as written. This cell looks like it was written
# against a different version of the helper; confirm before relying on it.
cnn = prep_new_model(seed=0, arch="CNN", config=os.path.join(eu.settings.config_dir, "ssCNN.yaml"))
jores = prep_new_model(seed=0, arch="Jores21CNN", config=os.path.join(eu.settings.config_dir, "Jores21CNN.yaml"))
# Compares the first kernel of the CNN's first conv module with the first Jores21CNN biconv kernel
torch.all(cnn.convnet.module[0].weight[0] == jores.biconv.kernels[0][0])