# Train a DeepSTARR architecture using EUGENe
- Assumes you have already generated the training SeqData (a zarr store that `sd.open_zarr` can read)

# Set-up

In [4]:
# Import modules
import os
import shutil

import torch

# ML4G
import seqdata as sd

# EUGENe
from eugene.models import zoo
from eugene import models
from eugene import train

In [5]:
# Make the dataset directory the working directory so the relative
# "training/..." and "models/..." paths used below resolve against it
dataset_root = "/cellar/users/aklie/data/datasets/deAlmeida_DrosophilaS2_UMI-STARR-seq"
os.chdir(dataset_root)

In [6]:
# Report which device torch can use: the first GPU if CUDA is available, else the CPU
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [7]:
# Run identifiers: together they name the log directory for this training run
date = "2023_12_19"
dataset = "evoaug"
model = "DeepSTARR"
target_var = "both"
train_var = "train_val"

# Set log dir (uses the string label in target_var, before it becomes a list below)
log_dir = "/".join(["models", date, dataset, model, target_var, train_var])
print(f"Log dir: {log_dir}")

# Set task: translate the short target label into the list of target variable names.
# For "both", the variable names depend on which dataset is being used.
_both_targets_by_dataset = {
    "evoaug": ["target"],
    "seqdatasets": ["Dev_log2_enrichment", "Hk_log2_enrichment"],
}
_single_targets = {
    "dev": ["Dev_log2_enrichment"],
    "hk": ["Hk_log2_enrichment"],
}
if target_var == "both":
    if dataset not in _both_targets_by_dataset:
        raise ValueError(f"Invalid dataset: {dataset}")
    target_var = _both_targets_by_dataset[dataset]
elif target_var in _single_targets:
    target_var = _single_targets[target_var]
else:
    raise ValueError(f"Invalid target: {target_var}")

# TODO: set transforms depending on dataset (nothing is configured here yet)

Log dir: models/2023_12_19/evoaug/DeepSTARR/both/train_val


# Load SeqData

In [12]:
# Open the training SeqData zarr store for the selected dataset
# (path is relative to the working directory set above)
training_zarr = f"training/2023_12_19/{dataset}/deAlmeida22_training.zarr"
sdata = sd.open_zarr(training_zarr)
sdata

Unnamed: 0,Array,Chunk
Bytes,1.64 GiB,3.38 MiB
Shape,"(442848, 4, 249)","(27678, 1, 32)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.64 GiB 3.38 MiB Shape (442848, 4, 249) (27678, 1, 32) Dask graph 512 chunks in 2 graph layers Data type float32 numpy.ndarray",249  4  442848,

Unnamed: 0,Array,Chunk
Bytes,1.64 GiB,3.38 MiB
Shape,"(442848, 4, 249)","(27678, 1, 32)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.38 MiB,432.47 kiB
Shape,"(442848, 2)","(110712, 1)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.38 MiB 432.47 kiB Shape (442848, 2) (110712, 1) Dask graph 8 chunks in 2 graph layers Data type float32 numpy.ndarray",2  442848,

Unnamed: 0,Array,Chunk
Bytes,3.38 MiB,432.47 kiB
Shape,"(442848, 2)","(110712, 1)"
Dask graph,8 chunks in 2 graph layers,8 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,432.47 kiB,216.23 kiB
Shape,"(442848,)","(221424,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 432.47 kiB 216.23 kiB Shape (442848,) (221424,) Dask graph 2 chunks in 2 graph layers Data type bool numpy.ndarray",442848  1,

Unnamed: 0,Array,Chunk
Bytes,432.47 kiB,216.23 kiB
Shape,"(442848,)","(221424,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray


In [13]:
# Pull the first 10 entries of "ohe_seq" into a float32 tensor to use as a
# small input for a forward-pass check of the architecture
first_ten_seqs = sdata["ohe_seq"][:10].values
test_seqs = torch.tensor(first_ten_seqs, dtype=torch.float32)
test_seqs.shape

torch.Size([10, 4, 249])

# Instantiate the model

In [14]:
# Build the DeepSTARR architecture and run the small test batch through it
# to confirm the output shape
arch_kwargs = {"input_len": 249, "output_dim": 2}
arch = zoo.DeepSTARR(**arch_kwargs)
arch(test_seqs).shape

torch.Size([10, 2])

In [15]:
# Wrap the architecture in a SequenceModule: regression task, MSE loss,
# Pearson metric, and an LR scheduler that monitors the epoch validation loss
module_settings = {
    "task": "regression",
    "loss_fxn": "mse",
    "optimizer_lr": 0.002,
    "optimizer_kwargs": {"weight_decay": 1e-6},
    "scheduler": "reduce_lr_on_plateau",
    "scheduler_monitor": "val_loss_epoch",
    "metric": "pearson",
    "seed": 1234,
}
module = models.SequenceModule(arch=arch, **module_settings)

[rank: 0] Global seed set to 1234


# Train the model

In [16]:
# Fit the module on the SeqData. The same quantity (epoch validation loss,
# lower is better) is monitored for both early stopping and checkpointing.
monitored_quantity = "val_loss_epoch"
trainer = train.fit_sequence_module(
    # What to train and on which data
    model=module,
    sdata=sdata,
    seq_var="ohe_seq",
    target_vars=target_var,
    in_memory=True,
    train_var=train_var,
    # Training schedule
    epochs=100,
    batch_size=128,
    # Early stopping
    early_stopping_monitor=monitored_quantity,
    early_stopping_patience=10,
    early_stopping_mode="min",
    # Checkpointing
    model_checkpoint_monitor=monitored_quantity,
    model_checkpoint_k=5,
    model_checkpoint_mode="min",
    # Logging: empty name/version are passed through unchanged
    logger="csv",
    log_dir=log_dir,
    name="",
    version="",
    seed=1234,
    # Keep the trainer so the best checkpoint path can be read afterwards
    return_trainer=True,
)

Dropping 0 sequences with NaN targets.
Loading ohe_seq and ['target'] into memory


[rank: 0] Global seed set to 1234
  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type            | Params
-------------------------------------------------
0 | arch         | DeepSTARR       | 7.7 M 
1 | train_metric | PearsonCorrCoef | 0     
2 | val_metric   | PearsonCorrCoef | 0     
3 | test_metric  | PearsonCorrCoef | 0     
-------------------------------------------------
7.7 M     Trainable params
0         Non-trainable params
7.7 M     Total params
30.745    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [17]:
# Copy (not move) the best checkpoint into the log directory under a fixed name,
# leaving the original checkpoint file in place
best_model_path = trainer.checkpoint_callback.best_model_path
copy_path = os.path.join(log_dir, "best_model.ckpt")
# shutil.copy raises if the copy fails and needs no shell quoting, whereas the
# previous os.system("cp ...") call only returned an exit status that was never
# checked and broke on paths containing spaces or shell metacharacters
shutil.copy(best_model_path, copy_path)

0

# DONE!

---