# Set-up

In [1]:
import os
import shutil

import numpy as np
import pandas as pd
import seqdata as sd
import torch
import xarray as xr

from eugene import models
from eugene import preprocess as pp
from eugene import train
from eugene.models import zoo

In [18]:
# Model log directory. Defaults to the original project location, but can be
# redirected by setting the DEEPBIND_LOG_DIR environment variable so the
# notebook is runnable on machines that do not have this exact path.
log_dir = os.environ.get(
    "DEEPBIND_LOG_DIR",
    "/cellar/users/aklie/projects/ML4GLand/use_cases/ResidualBind/models/2023_12_16/DeepBind",
)

# Load training and validation data

In [19]:
# Load the pre-split training and validation datasets
sdata_train = sd.open_zarr("/cellar/users/aklie/data/ml4gland/pubs/koo21_gia/log_norm/rnacompete2013_train.zarr")
sdata_valid = sd.open_zarr("/cellar/users/aklie/data/ml4gland/pubs/koo21_gia/log_norm/rnacompete2013_valid.zarr")

# Flag every sequence with its split (True for the training file, False for the
# validation file); this variable is passed as `train_var` when fitting below.
# `Dataset.sizes` is the dimension-name -> length mapping; using
# `Dataset.dims[...]` for that purpose is deprecated in recent xarray releases.
sdata_train["train_val"] = xr.DataArray(
    np.ones(sdata_train.sizes["_sequence"], dtype=bool), dims=["_sequence"]
)
sdata_valid["train_val"] = xr.DataArray(
    np.zeros(sdata_valid.sizes["_sequence"], dtype=bool), dims=["_sequence"]
)

# Stack both splits along the sequence dimension. With data_vars="minimal" only
# variables that already carry `_sequence` are concatenated.
sdata = xr.concat([sdata_train, sdata_valid], dim="_sequence", data_vars="minimal")

# Expose the inputs under the name that is used as `seq_var` during training
sdata["ohe_seq"] = sdata["inputs"]

# Pull the data into memory; as the cell's last expression this also displays the dataset
sdata.load()

# Load model

In [20]:
# Build a DeepBind network from the eugene model zoo: inputs of length 41,
# one output value, configured in "rbp" mode
arch = zoo.DeepBind(input_len=41, output_dim=1, mode="rbp")

In [21]:
# Initialize the weights of the architecture created above using eugene's
# `models.init_weights`. The initialization scheme is whatever that function
# applies by default — it is not configured here.
models.init_weights(arch)

In [22]:
# Wrap the architecture in a SequenceModule, which bundles the task, loss,
# optimizer settings, learning-rate scheduler and evaluation metric.
module = models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer_lr=0.001,
    scheduler="reduce_lr_on_plateau",
    scheduler_monitor="val_pearson_epoch",
    # The monitored quantity is a Pearson correlation, i.e. higher is better.
    # Plateau schedulers default to mode="min", which would treat an improving
    # correlation as "no progress" and lower the learning rate prematurely.
    # NOTE(review): assumes `scheduler_kwargs` is forwarded to the underlying
    # torch ReduceLROnPlateau — confirm against the installed eugene version.
    scheduler_kwargs={"mode": "max"},
    metric="pearson",
    seed=1234,
)

[rank: 0] Global seed set to 1234


In [23]:
# Select the RBP to train on. Only the first target is used in this notebook;
# change `index` to train on a different one.
index = 0

# `Dataset.sizes` maps dimension names to lengths (the mapping form of
# `Dataset.dims` is deprecated). Fail early instead of leaving `rbp_id` undefined.
if sdata.sizes["_target"] <= index:
    raise ValueError(f"No target at index {index}: dataset has {sdata.sizes['_target']} targets")

# Store the chosen target column under "target", the name used as `target_vars` when fitting.
# NOTE(review): the positional slice assumes "targets" is laid out as (_sequence, _target) — confirm.
sdata["target"] = sdata["targets"][:, index]
rbp_id = sdata["rbp_id"].values[index]
print(rbp_id)

RNCMPT00100


In [24]:
# Keep only the sequences that have a measured (non-null) value for the selected target
has_target = sdata["target"].notnull()
sdata = sdata.sel(_sequence=has_target)

In [25]:
def _swap_last_two_axes(x):
    """Permute a 3-D batch from axes (0, 1, 2) to (0, 2, 1) and return it as a float32 tensor."""
    # presumably (batch, length, alphabet) -> (batch, alphabet, length) — TODO confirm
    return torch.tensor(x.transpose(0, 2, 1), dtype=torch.float32)


# Fit the module on the selected RBP and keep the trainer for checkpoint lookup
trainer = train.fit_sequence_module(
    model=module,
    # Data: which variables to read and how to split them
    sdata=sdata,
    seq_var="ohe_seq",
    target_vars=["target"],
    train_var="train_val",
    in_memory=True,
    transforms={"ohe_seq": _swap_last_two_axes},
    # Optimization schedule
    epochs=100,
    batch_size=100,
    seed=1234,
    # Early stopping on the validation Pearson correlation (higher is better)
    early_stopping_metric="val_pearson_epoch",
    early_stopping_mode="max",
    early_stopping_patience=20,
    # Checkpointing: keep the 5 best checkpoints by the same metric
    model_checkpoint_monitor="val_pearson_epoch",
    model_checkpoint_mode="max",
    model_checkpoint_k=5,
    # Logging: CSV logs written under <log_dir>/<rbp_id> with empty name/version
    logger="csv",
    log_dir=os.path.join(log_dir, rbp_id),
    name="",
    version="",
    return_trainer=True,
)

[rank: 0] Global seed set to 1234


Dropping 0 sequences with NaN targets.
Loading ohe_seq and ['target'] into memory


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type            | Params
-------------------------------------------------
0 | arch         | DeepBind        | 2.1 K 
1 | train_metric | PearsonCorrCoef | 0     
2 | val_metric   | PearsonCorrCoef | 0     
3 | test_metric  | PearsonCorrCoef | 0     
-------------------------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.009     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [11]:
# Copy the best checkpoint into the run directory under a stable file name
print("Copying best model to log directory")
best_model_path = trainer.checkpoint_callback.best_model_path
if not best_model_path:
    # Nothing to copy if the checkpoint callback never recorded a best model
    raise RuntimeError("No best checkpoint recorded by the trainer; nothing to copy")

copy_path = os.path.join(log_dir, rbp_id, "best_model.ckpt")
os.makedirs(os.path.dirname(copy_path), exist_ok=True)

# shutil.copy raises on failure and does not go through a shell, so paths with
# spaces or shell metacharacters are handled and errors are not silently ignored
# (os.system only returned an exit status that was never checked).
shutil.copy(best_model_path, copy_path)

'/cellar/users/aklie/projects/ML4GLand/use_cases/DeepBind/models/2023_12_16/RNCMPT00100/checkpoints/epoch=0-step=1083.ckpt'

# DONE!

---