# Jores et al. 2021 Training
**Authorship:**
Adam Klie, *08/11/2022*
***
**Description:**
Notebook for training models on the Jores et al. (2021) dataset: `leaf`, `proto`, and `combined` models, each trained over several random initializations.
***

In [1]:
# Load IPython's autoreload extension only if this kernel has not loaded it yet,
# so re-running the cell does not try to load it a second time.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
# Mode 2: reload imported modules before each cell executes, so edits to local
# packages are picked up without restarting the kernel.
%autoreload 2

import os
import logging
import torch
import numpy as np
import pandas as pd
import eugene as eu

Global seed set to 13


GPU is available: True
Number of GPUs: 1
Current GPU: 0
GPUs: Quadro RTX 5000


  min_coords = np.vstack(data.min(0) for data in polygons_data).min(0)
  max_coords = np.vstack(data.max(0) for data in polygons_data).max(0)


In [5]:
# Root locations for this analysis. The defaults reproduce the original cluster
# layout exactly; set the environment variables to run the notebook on another
# machine without editing this cell.
EUGENE_DATA_ROOT = os.environ.get("EUGENE_DATA_ROOT", "/cellar/users/aklie/data/eugene")
EUGENE_PAPER_ROOT = os.environ.get(
    "EUGENE_PAPER_ROOT", "/cellar/users/aklie/projects/EUGENe/EUGENe_paper"
)
DATASET_NAME = "jores21"

# Point EUGENe at the dataset, output, logging and model-config directories.
eu.settings.dataset_dir = os.path.join(EUGENE_DATA_ROOT, DATASET_NAME)
eu.settings.output_dir = os.path.join(EUGENE_PAPER_ROOT, "output", DATASET_NAME)
eu.settings.logging_dir = os.path.join(EUGENE_PAPER_ROOT, "logs", DATASET_NAME)
eu.settings.config_dir = os.path.join(EUGENE_PAPER_ROOT, "configs", DATASET_NAME)
# Set EUGENe's verbosity to the ERROR logging level.
eu.settings.verbosity = logging.ERROR

# Load in the `leaf`, `proto` and `combined` `SeqData`s 

In [6]:
# Read the preprocessed leaf and proto training sets from the dataset directory,
# then concatenate them (keyed "leaf" / "proto") into the combined set.
leaf_train_path = os.path.join(eu.settings.dataset_dir, "leaf_processed_train.h5sd")
proto_train_path = os.path.join(eu.settings.dataset_dir, "proto_processed_train.h5sd")

sdata_leaf = eu.dl.read(leaf_train_path)
sdata_proto = eu.dl.read(proto_train_path)
sdata_combined = eu.dl.concat(
    [sdata_leaf, sdata_proto],
    keys=["leaf", "proto"],
)

# Display all three objects as the cell output
(sdata_leaf, sdata_proto, sdata_combined)

(SeqData object with = 65004 seqs
 seqs = (65004,)
 names = (65004,)
 rev_seqs = (65004,)
 ohe_seqs = (65004, 170, 4)
 ohe_rev_seqs = (65004, 170, 4)
 seqs_annot: 'GC', 'barcodes', 'batch', 'chromosome', 'end', 'enrichment', 'gene', 'mutations', 'set', 'sp', 'start', 'strand', 'train_val', 'type'
 pos_annot: None
 seqsm: None
 uns: None,
 SeqData object with = 68213 seqs
 seqs = (68213,)
 names = (68213,)
 rev_seqs = (68213,)
 ohe_seqs = (68213, 170, 4)
 ohe_rev_seqs = (68213, 170, 4)
 seqs_annot: 'GC', 'barcodes', 'batch', 'chromosome', 'end', 'enrichment', 'gene', 'mutations', 'set', 'sp', 'start', 'strand', 'train_val', 'type'
 pos_annot: None
 seqsm: None
 uns: None,
 SeqData object with = 133217 seqs
 seqs = (133217,)
 names = (133217,)
 rev_seqs = (133217,)
 ohe_seqs = (133217, 170, 4)
 ohe_rev_seqs = (133217, 170, 4)
 seqs_annot: 'GC', 'barcodes', 'batch', 'chromosome', 'end', 'enrichment', 'gene', 'mutations', 'set', 'sp', 'start', 'strand', 'train_val', 'type'
 pos_annot: None

In [7]:
# Load the two MEME files that provide the motifs used for filter initialization
cpe_meme_path = os.path.join(eu.settings.dataset_dir, 'CPEs.meme')
tf_meme_path = os.path.join(eu.settings.dataset_dir, 'TF-clusters.meme')
core_promoter_elements = eu.utils.MinimalMEME(cpe_meme_path)
tf_groups = eu.utils.MinimalMEME(tf_meme_path)

# Merge both motif collections into one mapping; if a key appears in both,
# the TF-cluster entry overrides the core-promoter one.
all_motifs = {
    **core_promoter_elements.motifs,
    **tf_groups.motifs,
}

# Show how many motifs are available in total
len(all_motifs)

78

In [25]:
from pytorch_lightning import seed_everything


def prep_new_model(
    seed,
    arch,
    config
):
    """Build a model from a config file and initialize its conv filters from motifs.

    The model is instantiated with ``eu.models.load_config``, the global seed is
    set, the weights are initialized with ``eu.models.base.init_weights`` and the
    convolutional filters are then initialized from the notebook-level
    ``all_motifs`` mapping via ``eu.models.init_from_motifs``.

    Parameters
    ----------
    seed : int
        Seed passed to ``pytorch_lightning.seed_everything`` before weight
        initialization.
    arch : str
        Architecture name passed to ``eu.models.load_config``. Must be one of
        ``"Jores21CNN"``, ``"CNN"`` or ``"Hybrid"``, since those are the only
        architectures this helper knows how to locate the conv filters for.
    config : str
        Path to the model config file, passed as ``model_config``.

    Returns
    -------
    The initialized model.

    Raises
    ------
    ValueError
        If ``arch`` is not one of the supported architectures. Previously this
        surfaced as an ``UnboundLocalError`` after the model had already been
        built and the global seed changed.
    """
    # Resolve where the filters to initialize live for this architecture.
    # Done first so an unsupported `arch` fails fast with a clear message.
    if arch == "Jores21CNN":
        layer_name, kernel_name, kernel_number, module_number = "biconv", "kernels", 0, None
    elif arch in ["CNN", "Hybrid"]:
        layer_name, kernel_name, kernel_number, module_number = "convnet", None, None, 0
    else:
        raise ValueError(
            f"Unsupported arch {arch!r}; expected one of 'Jores21CNN', 'CNN', 'Hybrid'"
        )

    # Instantiate the model
    model = eu.models.load_config(
        arch=arch,
        model_config=config
    )

    # Seed after instantiation so the initialization steps below are reproducible
    seed_everything(seed)

    # Initialize the model prior to conv filter initialization
    eu.models.base.init_weights(model)

    # Initialize the conv filters
    eu.models.init_from_motifs(
        model,
        all_motifs,
        layer_name=layer_name,
        kernel_name=kernel_name,
        kernel_number=kernel_number,
        module_number=module_number,
    )

    # Return the model
    return model

In [26]:
# Sanity check: build a motif-initialized Hybrid model and display its architecture
prep_new_model(
    seed=0,
    arch="Hybrid",
    config=os.path.join(eu.settings.config_dir, "ssHybrid.yaml"),
)

Global seed set to 0


Hybrid(
  (hp_metric): R2Score()
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 256, kernel_size=(13,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Dropout(p=0.3, inplace=False)
      (4): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): Conv1d(256, 256, kernel_size=(13,), stride=(1,))
      (6): ReLU()
      (7): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (8): Dropout(p=0.3, inplace=False)
      (9): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): Conv1d(256, 256, kernel_size=(13,), stride=(1,))
      (11): ReLU()
      (12): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (13): Dropout(p=0.3, inplace=False)
      (14): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (recurren

# Train leaf models

In [None]:
# Train 5 models with 5 different random initializations
# For each (model name, architecture) pair and each trial: build a fresh model
# seeded with the trial number, fit it on the leaf data, store its train/val
# predictions on `sdata_leaf`, and finally write `sdata_leaf` to disk.
# NOTE(review): as saved, this cell only covers Jores21CNN trials 2-5 (see the
# commented-out lists and `range(2, ...)` below). This looks like a resumed run;
# restore the full lists and start at trial 1 to reproduce every prediction
# column from a fresh kernel - confirm.
#model_types = ["CNN", "Hybrid", "Jores21CNN"]
#model_names = ["ssCNN", "ssHybrid", "Jores21CNN"]
trials = 5
model_types = ["Jores21CNN"]
model_names = ["Jores21CNN"]
for model_name, model_type in zip(model_names, model_types):
    for trial in range(2, trials+1):
        print(f"{model_name} trial {trial}")

        # Initialize the model
        # The config file is looked up by model name; the trial number doubles as the seed.
        leaf_model = prep_new_model(
            arch=model_type, 
            config=os.path.join(eu.settings.config_dir, f"{model_name}.yaml"),
            seed=trial
        )
        # Train the model
        # `name` and `version` identify this run; the same pair is passed to the
        # prediction call below.
        eu.train.fit(
            model=leaf_model, 
            sdata=sdata_leaf, 
            gpus=1, 
            target="enrichment",
            train_key="train_val",
            epochs=25,
            batch_size=128,
            num_workers=0,
            name=model_name,
            seed=trial,
            version=f"leaf_trial_{trial}",
            verbosity=logging.ERROR
        )
        # Get predictions on the training data
        # Presumably the prediction dataloaders read this global setting - TODO confirm.
        eu.settings.dl_num_workers = 0
        eu.predict.train_val_predictions(
            leaf_model,
            sdata=sdata_leaf, 
            target="enrichment",
            train_key="train_val",
            name=model_name,
            version=f"leaf_trial_{trial}",
            prefix=f"{model_name}_trial_{trial}_"
        )
        # Drop the reference to the trained model before the next trial
        del leaf_model
# Written once, after all models and trials have finished
sdata_leaf.write_h5sd(os.path.join(eu.settings.output_dir, "leaf_train_predictions.h5sd"))

Jores21CNN trial 2


Global seed set to 2
Global seed set to 2


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type        | Params
------------------------------------------
0 | hp_metric | R2Score     | 0     
1 | biconv    | BiConv1D    | 1.7 M 
2 | conv      | Conv1d      | 852 K 
3 | dropout   | Dropout     | 0     
4 | fc        | Linear      | 2.8 M 
5 | batchnorm | BatchNorm1d | 128   
6 | fc2       | Linear      | 65    
------------------------------------------
5.4 M     Trainable params
0         Non-trainable params
5.4 M     Total params
21.423    Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Global seed set to 2
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.663


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.351 >= min_delta = 0.0. New best score: 1.311


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.221 >= min_delta = 0.0. New best score: 1.091


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.043 >= min_delta = 0.0. New best score: 1.048


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.142 >= min_delta = 0.0. New best score: 0.906


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.897


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.897


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.891


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.012 >= min_delta = 0.0. New best score: 0.878


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.878. Signaling Trainer to stop.


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

SeqData object modified:
    seqs_annot:
        + Jores21CNN_trial_2_enrichment_predictions
Jores21CNN trial 3


Global seed set to 3
Global seed set to 3


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type        | Params
------------------------------------------
0 | hp_metric | R2Score     | 0     
1 | biconv    | BiConv1D    | 1.7 M 
2 | conv      | Conv1d      | 852 K 
3 | dropout   | Dropout     | 0     
4 | fc        | Linear      | 2.8 M 
5 | batchnorm | BatchNorm1d | 128   
6 | fc2       | Linear      | 65    
------------------------------------------
5.4 M     Trainable params
0         Non-trainable params
5.4 M     Total params
21.423    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 3


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.325


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.115 >= min_delta = 0.0. New best score: 1.210


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.088 >= min_delta = 0.0. New best score: 1.122


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.017 >= min_delta = 0.0. New best score: 1.105


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.008 >= min_delta = 0.0. New best score: 1.097


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.130 >= min_delta = 0.0. New best score: 0.968


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.029 >= min_delta = 0.0. New best score: 0.938


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.075 >= min_delta = 0.0. New best score: 0.863


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.856


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.850


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

SeqData object modified:
    seqs_annot:
        + Jores21CNN_trial_3_enrichment_predictions
Jores21CNN trial 4


Global seed set to 4
Global seed set to 4


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type        | Params
------------------------------------------
0 | hp_metric | R2Score     | 0     
1 | biconv    | BiConv1D    | 1.7 M 
2 | conv      | Conv1d      | 852 K 
3 | dropout   | Dropout     | 0     
4 | fc        | Linear      | 2.8 M 
5 | batchnorm | BatchNorm1d | 128   
6 | fc2       | Linear      | 65    
------------------------------------------
5.4 M     Trainable params
0         Non-trainable params
5.4 M     Total params
21.423    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 4


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 1.317


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.134 >= min_delta = 0.0. New best score: 1.183


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.077 >= min_delta = 0.0. New best score: 1.106


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.040 >= min_delta = 0.0. New best score: 1.066


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.021 >= min_delta = 0.0. New best score: 1.045


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.027 >= min_delta = 0.0. New best score: 1.018


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.111 >= min_delta = 0.0. New best score: 0.907


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.897


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.890


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.014 >= min_delta = 0.0. New best score: 0.876


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
GPU available: True, used: True
TPU available: False, using: 0 TPU cores


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

SeqData object modified:
    seqs_annot:
        + Jores21CNN_trial_4_enrichment_predictions
Jores21CNN trial 5


Global seed set to 5
Global seed set to 5
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


No transforms given, assuming just need to tensorize).
No transforms given, assuming just need to tensorize).


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name      | Type        | Params
------------------------------------------
0 | hp_metric | R2Score     | 0     
1 | biconv    | BiConv1D    | 1.7 M 
2 | conv      | Conv1d      | 852 K 
3 | dropout   | Dropout     | 0     
4 | fc        | Linear      | 2.8 M 
5 | batchnorm | BatchNorm1d | 128   
6 | fc2       | Linear      | 65    
------------------------------------------
5.4 M     Trainable params
0         Non-trainable params
5.4 M     Total params
21.423    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 5


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.146 >= min_delta = 0.0. New best score: 1.348


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.146 >= min_delta = 0.0. New best score: 1.202


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.983


Validating: 0it [00:00, ?it/s]

In [28]:
# Display the leaf SeqData; its seqs_annot listing includes the per-trial prediction columns
sdata_leaf

SeqData object with = 65004 seqs
seqs = (65004,)
names = (65004,)
rev_seqs = (65004,)
ohe_seqs = (65004, 170, 4)
ohe_rev_seqs = (65004, 170, 4)
seqs_annot: 'GC', 'barcodes', 'batch', 'chromosome', 'end', 'enrichment', 'gene', 'mutations', 'set', 'sp', 'start', 'strand', 'train_val', 'type', 'ssCNN_trial_1_enrichment_predictions', 'ssCNN_trial_2_enrichment_predictions', 'ssCNN_trial_3_enrichment_predictions', 'ssCNN_trial_4_enrichment_predictions', 'ssCNN_trial_5_enrichment_predictions', 'ssHybrid_trial_1_enrichment_predictions', 'ssHybrid_trial_2_enrichment_predictions', 'ssHybrid_trial_3_enrichment_predictions', 'ssHybrid_trial_4_enrichment_predictions', 'ssHybrid_trial_5_enrichment_predictions', 'Jores21CNN_trial_1_enrichment_predictions'
pos_annot: None
seqsm: None
uns: None

# Train proto models

In [None]:
# Smoke-test the proto training pipeline: for each architecture, train 5 models
# with 5 different random initializations on the first 100 proto sequences for a
# single epoch, and store their train/val predictions on the subset.
model_types = ["CNN", "Jores21CNN"]
model_names = ["ssCNN", "Jores21CNN"]
sdata_proto_sub = sdata_proto[:100]
trials = 5
for model_name, model_type in zip(model_names, model_types):
    for trial in range(1, trials+1):
        print(f"{model_name} trial {trial}")

        # Initialize the model
        # Seed with the trial number (as the leaf training cell does) so each
        # trial really starts from a different initialization; a fixed seed
        # would give every trial identical starting weights.
        proto_model = prep_new_model(
            arch=model_type,
            config=os.path.join(eu.settings.config_dir, f"{model_name}.yaml"),
            seed=trial
        )
        # Train the model
        eu.train.fit(
            model=proto_model,
            sdata=sdata_proto_sub,
            #gpus=1,
            target="enrichment",
            train_key="train_val",
            #epochs=25,
            epochs=1,  # single epoch for the smoke test
            name=model_name,
            version=f"test_proto_trial_{trial}",
            seed=trial,
            verbosity=logging.ERROR
        )
        # Get predictions on the training data
        eu.predict.train_val_predictions(
            proto_model,
            sdata=sdata_proto_sub,
            target="enrichment",
            train_key="train_val",
            name=model_name,
            version=f"test_proto_trial_{trial}",
            prefix=f"{model_name}_test_trial_{trial}_"
        )
        # Drop the reference to the trained model before the next trial
        del proto_model
    # Written after each architecture finishes all of its trials
    sdata_proto_sub.write_h5sd(os.path.join(eu.settings.output_dir, "proto_train_sub_predictions.h5sd"))

# Train combined models

In [None]:
# Smoke-test the combined training pipeline: for each architecture, train 5
# models with 5 different random initializations on the first 100 combined
# sequences for a single epoch, and store their train/val predictions on the subset.
model_types = ["CNN", "Jores21CNN"]
model_names = ["ssCNN", "Jores21CNN"]
sdata_combined_sub = sdata_combined[:100]
trials = 5
for model_name, model_type in zip(model_names, model_types):
    for trial in range(1, trials+1):
        print(f"{model_name} trial {trial}")

        # Initialize the model
        # Seed with the trial number (as the leaf training cell does) so each
        # trial really starts from a different initialization; a fixed seed
        # would give every trial identical starting weights.
        combined_model = prep_new_model(
            arch=model_type,
            config=os.path.join(eu.settings.config_dir, f"{model_name}.yaml"),
            seed=trial
        )
        # Train the model
        eu.train.fit(
            model=combined_model,
            sdata=sdata_combined_sub,
            #gpus=1,
            target="enrichment",
            train_key="train_val",
            #epochs=25,
            epochs=1,  # single epoch for the smoke test
            name=model_name,
            version=f"test_combined_trial_{trial}",
            seed=trial,
            verbosity=logging.ERROR
        )
        # Get predictions on the training data
        eu.predict.train_val_predictions(
            combined_model,
            sdata=sdata_combined_sub,
            target="enrichment",
            train_key="train_val",
            name=model_name,
            version=f"test_combined_trial_{trial}",
            prefix=f"{model_name}_test_trial_{trial}_"
        )
        # Drop the reference to the trained model before the next trial
        del combined_model
# Written once, after all models and trials have finished
sdata_combined_sub.write_h5sd(os.path.join(eu.settings.output_dir, "combined_train_sub_predictions.h5sd"))

---

# Scratch

In [None]:
# Scratch check of motif-based conv kernel initialization.
# TODO: this needs a fix!
# Build one CNN and one Jores21CNN with the same seed ...
cnn = prep_new_model(
    seed=0,
    arch="CNN",
    config=os.path.join(eu.settings.config_dir, "ssCNN.yaml"),
)
jores = prep_new_model(
    seed=0,
    arch="Jores21CNN",
    config=os.path.join(eu.settings.config_dir, "Jores21CNN.yaml"),
)
# ... then test whether the first filter of each model's first conv layer matches element-wise
torch.all(
    cnn.convnet.module[0].weight[0] == jores.biconv.kernels[0][0]
)