# Ray et al 2013 Single Task Training 
**Authorship:**
Adam Klie, *08/31/2022*
***
**Description:**
Notebook to train simple *single-task* DeepBind models on the Ray et al. (2013) dataset.
***

In [8]:
# Load the autoreload extension only when this kernel has not loaded it yet,
# so re-running the cell does not load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Autoreload mode 2: re-import edited modules before each cell executes.
%autoreload 2

import os
import logging
import torch
import numpy as np
import pandas as pd
import eugene as eu

In [9]:
# Point eugene's dataset, output, logging and config directories at the
# ray13 sub-folders. The paths are relative, so they resolve against the
# working directory the notebook is run from.
eu.settings.dataset_dir = "../../../_datasets/ray13"
eu.settings.output_dir = "../../../_output/ray13"
eu.settings.logging_dir = "../../../_logs/ray13"
eu.settings.config_dir = "../../../_configs/ray13"
# Set eugene's verbosity to the ERROR logging level
eu.settings.verbosity = logging.ERROR

# Load in the SetA training `SeqData`

In [10]:
# Load in the training SetA processed data from the configured dataset directory.
# The directory is joined exactly once so the path is also correct when
# `eu.settings.dataset_dir` is a relative path.
sdata_training = eu.dl.read_h5sd(os.path.join(eu.settings.dataset_dir, "norm_setA_sub.h5sd"))

In [11]:
# Select the annotation columns that hold prediction targets:
# every column of `seqs_annot` whose name contains "RNCMPT".
annot_columns = sdata_training.seqs_annot.columns
target_mask = annot_columns.str.contains("RNCMPT")
target_cols = annot_columns[target_mask]

In [12]:
from pytorch_lightning import seed_everything
def prep_new_model(
    seed,
    conv_dropout = 0,
    fc_dropout = 0,
    batchnorm = True
):
    """Build a fresh single-output DeepBind regression model with seeded weights.

    The model is constructed first; the global seed is set afterwards and
    `eu.models.base.init_weights` is then applied to the model.

    Parameters
    ----------
    seed
        Value handed to `seed_everything` right before weight initialization.
    conv_dropout
        Passed as `dropout_rates` to the convolutional block.
    fc_dropout
        Passed as `dropout_rate` to the fully connected block.
    batchnorm
        Passed as `batchnorm` to both the convolutional and fully connected blocks.

    Returns
    -------
    The `eu.models.DeepBind` instance after `init_weights` has been applied.
    """
    # Keyword bundles for the three sub-modules of the architecture
    conv_settings = dict(channels=[4, 32], conv_kernels=[16], dropout_rates=conv_dropout, batchnorm=batchnorm)
    pool_settings = dict(kernel_size=8)
    dense_settings = dict(hidden_dims=[32], dropout_rate=fc_dropout, batchnorm=batchnorm)

    net = eu.models.DeepBind(
        input_len=41,   # length of the padded sequences
        output_dim=1,   # one output: single-task model
        strand="ss",
        task="regression",
        lr=0.0005,
        scheduler_patience=3,
        conv_kwargs=conv_settings,
        mp_kwargs=pool_settings,
        fc_kwargs=dense_settings,
        optimizer="sgd"
    )

    # Seed only after construction, so init_weights below runs from the seeded state
    seed_everything(seed)

    # Initialize the model weights (done prior to any conv filter initialization)
    eu.models.base.init_weights(net)

    return net

In [13]:
# Smoke-test an untrained model before launching any training
model = prep_new_model(0)
print(model.summary())

# Build a dataloader over the first 64 sequences and pull a single batch from it
sdataset = sdata_training[:64].to_dataset(transform_kwargs={"transpose": True})
sdataloader = sdataset.to_dataloader()
test_seqs = next(iter(sdataloader))

# Run a forward pass and print the output size.
# NOTE(review): batch items 1 and 2 are presumably the two sequence inputs — confirm against the dataset class.
test_output = model(test_seqs[1], test_seqs[2])
print(test_output.size())

Global seed set to 0


Model: DeepBind
Input length: 41
Output dimension: 1
Strand: ss
Task: regression
Aggregation: None
Loss function: mse_loss
Optimizer: sgd
	Optimizer parameters: {}
Learning rate: 0.0005
Scheduler: lr_scheduler
Scheduler patience: 3
  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 2.1 K 
3 | fcn       | BasicFullyConnectedModule | 3.5 K 
--------------------------------------------------------
5.6 K     Trainable params
0         Non-trainable params
5.6 K     Total params
0.022     Total estimated model params size (MB)
No transforms given, assuming just need to tensorize).
torch.Size([64, 1])


In [14]:
# Train one single-task DeepBind model per target column and attach its
# predictions to `sdata_training`.
# NOTE: only the first 5 target columns are trained here; remove the `[:5]`
# slice to train on every target.
for i, target_col in enumerate(target_cols[:5]):
    print(f"Training DeepBind SingleTask model on {target_col}")

    # Initialize a fresh model, seeded with the target's position in the loop
    model = prep_new_model(
        seed=i
    )

    # Train the model
    # NOTE(review): `seed=0` is passed for every target while the weights are
    # initialized with `seed=i` — confirm that this difference is intended.
    # NOTE(review): the patience equals the number of epochs, so early stopping
    # is unlikely to ever trigger with these settings — confirm.
    eu.train.fit(
        model=model, 
        sdata=sdata_training, 
        #gpus=1, 
        target=target_col,
        train_key="train_val",
        epochs=5,
        early_stopping_metric="val_loss",
        early_stopping_patience=5,
        batch_size=64,
        num_workers=4,
        name="DeepBind_ST",
        seed=0,
        version=target_col,
        verbosity=logging.ERROR
    )
    
    # Get predictions on the training data
    # (presumably dl_num_workers controls the dataloader workers used for prediction — confirm)
    eu.settings.dl_num_workers = 0
    eu.predict.train_val_predictions(
        model,
        sdata=sdata_training, 
        target=target_col,
        train_key="train_val",
        name="DeepBind_ST",
        version=target_col
    )

    # Drop the reference to the trained model before the next target
    del model 

# Make sure the output directory exists so the final write cannot fail on a
# missing folder after all the training work is done.
os.makedirs(eu.settings.output_dir, exist_ok=True)
sdata_training.write_h5sd(os.path.join(eu.settings.output_dir, "norm_training_predictions_ST.h5sd"))

Global seed set to 0
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 2.1 K 
3 | fcn       | BasicFullyConnectedModule | 3.5 K 
--------------------------------------------------------
5.6 K     Trainable params
0         Non-trainable params
5.6 K     Total params
0.022     Total estimated model params size (MB)


Training DeepBind SingleTask model on RNCMPT00001
Dropping 2 sequences with NaN targets.
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.167


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.162


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.158


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.157


Validating: 0it [00:00, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 1
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 2.1 K 
3 | fcn       | BasicFullyConnectedModule | 3.5 K 
--------------------------------------------------------
5.6 K     Trainable params
0         Non-trainable params
5.6 K     Total params
0.022     Total estimated model params size (MB)


SeqData object modified:
    seqs_annot:
        + RNCMPT00001_predictions
Training DeepBind SingleTask model on RNCMPT00002
Dropping 2 sequences with NaN targets.
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.067


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 1.063


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 1.063


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 2
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 2.1 K 
3 | fcn       | BasicFullyConnectedModule | 3.5 K 
--------------------------------------------------------
5.6 K     Trainable params
0         Non-trainable params
5.6 K     Total params
0.022     Total estimated model params size (MB)


SeqData object modified:
    seqs_annot:
        + RNCMPT00002_predictions
Training DeepBind SingleTask model on RNCMPT00003
Dropping 0 sequences with NaN targets.
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.018


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 3
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 2.1 K 
3 | fcn       | BasicFullyConnectedModule | 3.5 K 
--------------------------------------------------------
5.6 K     Trainable params
0         Non-trainable params
5.6 K     Total params
0.022     Total estimated model params size (MB)


SeqData object modified:
    seqs_annot:
        + RNCMPT00003_predictions
Training DeepBind SingleTask model on RNCMPT00004
Dropping 2 sequences with NaN targets.
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.451


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 4
Global seed set to 0
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name      | Type                      | Params
--------------------------------------------------------
0 | hp_metric | R2Score                   | 0     
1 | max_pool  | MaxPool1d                 | 0     
2 | convnet   | BasicConv1D               | 2.1 K 
3 | fcn       | BasicFullyConnectedModule | 3.5 K 
--------------------------------------------------------
5.6 K     Trainable params
0         Non-trainable params
5.6 K     Total params
0.022     Total estimated model params size (MB)


SeqData object modified:
    seqs_annot:
        + RNCMPT00004_predictions
Training DeepBind SingleTask model on RNCMPT00005
Dropping 3 sequences with NaN targets.
No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 0


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.991


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

SeqData object modified:
    seqs_annot:
        + RNCMPT00005_predictions


: 

---

# Scratch

In [None]:
# Test conv kernel initialization, this needs a fix!
# Builds two models with the same seed and checks whether the first conv
# filter of one equals the first kernel of the other.
# NOTE(review): `prep_new_model` as defined in this notebook only accepts
# `seed`, `conv_dropout`, `fc_dropout` and `batchnorm`; the `arch=` and
# `config=` keywords used below are not part of that signature, so these
# calls raise a TypeError until the helper (or this cell) is updated.
cnn = prep_new_model(seed=0, arch="CNN", config=os.path.join(eu.settings.config_dir, "ssCNN.yaml"))
jores = prep_new_model(seed=0, arch="Jores21CNN", config=os.path.join(eu.settings.config_dir, "Jores21CNN.yaml"))
torch.all(cnn.convnet.module[0].weight[0] == jores.biconv.kernels[0][0])