# Ray et al 2013 GPU Utilization Analysis
**Authorship:**
David Laub (last updated: *07/19/2023*)
***
**Description:**
Notebook comparing GPU utilization when training a DeepBind model on the Ray et al. 2013 dataset with a native PyTorch loop, PyTorch Lightning, and EUGENe
***

In [None]:
# General imports
import os
import sys
import torch
import wandb
from tqdm.auto import tqdm
import pandas as pd
import pytorch_lightning as pl
import xarray as xr

# EUGENe imports and settings
import eugene as eu
import eugene.train
import eugene.models
import eugene.models.zoo
from eugene import settings
# Point EUGENe's global settings at the directories used by this analysis.
# NOTE(review): the dataset directory sits under a different user's home
# (aklie) than the output/logging/config directories (dlaub) — confirm the
# paths exist and are readable on the machine running this notebook.
settings.dataset_dir = "/cellar/users/aklie/data/eugene/revision/ray13"
settings.output_dir = "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/output/ray13"
settings.logging_dir = "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/logs/ray13"
settings.config_dir = "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/configs/ray13"

# EUGENe packages
import seqdata as sd

# Print versions
# `md` (MotifData) is never imported in this notebook, so the original
# `md.__version__` lookup raised NameError and aborted the cell. Read the
# version from the installed distribution's metadata instead and fall back to
# a placeholder when the package is absent.
# NOTE(review): assumes the distribution is published as "motifdata" — confirm.
from importlib import metadata as importlib_metadata

try:
    motifdata_version = importlib_metadata.version("motifdata")
except importlib_metadata.PackageNotFoundError:
    motifdata_version = "not installed"

print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Eugene version: {eu.__version__}")
print(f"SeqData version: {sd.__version__}")
print(f"MotifData version: {motifdata_version}")
print(f"WandB version: {wandb.__version__}")

In [None]:
# Log in to Weights & Biases before any of the runs below are started
wandb.login()

# Load the data

In [None]:
# Load data: open the zarr store and stack the individual target columns
# into a single `target` variable indexed by a new `_targets` dimension.
sdata = sd.open_zarr('/cellar/users/aklie/data/eugene/revision/ray13/norm_setA_MT.zarr')

# Every data variable whose name contains 'RNCMPT' is treated as a target column
target_cols_MT = [n for n in sdata.data_vars if 'RNCMPT' in n]
sdata = (
    sdata
    .assign(
        # Concatenate the target columns along `_targets`, labelled by column name
        target=xr.concat(
            sdata[target_cols_MT].data_vars.values(),
            dim=xr.DataArray(target_cols_MT, dims='_targets')
        )
    )
    # The per-column variables are now redundant with `target`.
    # `Dataset.drop` is deprecated for dropping variables; use `drop_vars`.
    .drop_vars(target_cols_MT)
    .transpose('_sequence', '_targets', '_ohe', 'length')
)
# Display the resulting dataset in the notebook
sdata

# Define the model

In [None]:
# Instantiate the DeepBind architecture from the EUGENe model zoo
arch = eu.models.zoo.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols_MT),  # One output per target column (multitask)
    conv_kwargs={
        "input_channels": 4,
        "conv_channels": [1024],
        "conv_kernels": [16],
        "dropout_rates": 0.25,
        # NOTE(review): 0.25 mirrors dropout_rates while dense_kwargs passes a
        # boolean — possibly a copy-paste slip. Value kept as-is; confirm intent.
        "batchnorm": 0.25,
    },
    dense_kwargs={
        "hidden_dims": [512],
        "dropout_rates": 0.25,
        "batchnorm": True,
    },
)

# Run EUGENe's weight initialization on the freshly built architecture
eu.models.init_weights(arch)

# Wrap the architecture in a SequenceModule (regression task, MSE loss,
# Adam with lr=0.0005), then move the module to the GPU
model = eu.models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer="adam",
    optimizer_lr=0.0005,
)
model = model.cuda()

In [None]:
# Load dataset into memory
# Only the `ohe_seq` and `target` variables are loaded. The trailing semicolon
# suppresses the notebook's display of the Dataset returned by `.load()`.
# NOTE(review): the result is not assigned, so this relies on the selected
# subset sharing its underlying variables with `sdata` — confirm the load is
# actually visible through `sdata` afterwards.
sdata[['ohe_seq', 'target']].load();

In [None]:
# Build a dataloader restricted to the sequences flagged in `train_val`
# (`== True` is an elementwise comparison on the array, not an identity test)
train_mask = (sdata.train_val == True).compute()
train_sdata = sdata.sel(_sequence=train_mask)

# Batches expose the `ohe_seq` and `target` variables, sampled along `_sequence`
dl = sd.get_torch_dataloader(
    train_sdata,
    sample_dims='_sequence',
    variables=['ohe_seq', 'target'],
    batch_size=2**14,
)

## PyTorch run

In [None]:
def train(model, dloader, optim, loss_fn, *, device="cuda"):
    """Run one pass over ``dloader`` with a plain PyTorch training loop.

    Parameters
    ----------
    model
        Module called on each ``'ohe_seq'`` batch; ``model.train()`` is
        invoked first. The model itself is not moved to ``device`` here, so
        the caller must have placed it there already.
    dloader
        Iterable yielding batches indexable by ``'ohe_seq'`` and ``'target'``.
    optim
        Optimizer whose ``step()`` and ``zero_grad()`` are called once per batch.
    loss_fn
        Callable invoked as ``loss_fn(pred.squeeze(), target)``; its result
        must support ``backward()``.
    device
        Keyword-only. Device each batch is moved to. Defaults to ``"cuda"``,
        matching the previously hard-coded ``.cuda()`` calls.

    Returns
    -------
    None
        Nothing is returned or logged; the function only updates ``model``
        through ``optim``.
    """
    model.train()
    # position=1 / leave=False: intended as an inner progress bar that is
    # cleared after each pass, sitting below an outer bar at position 0
    for batch in tqdm(dloader, position=1, leave=False):
        pred = model(batch['ohe_seq'].to(device))
        # squeeze() removes every size-1 dimension of the prediction
        loss = loss_fn(pred.squeeze(), batch['target'].to(device))
        loss.backward()
        optim.step()
        # Reset gradients so they do not accumulate into the next batch
        optim.zero_grad()

In [None]:
# Native PyTorch run: fixed number of epochs with the hand-written `train` loop
epochs = 10

# The SequenceModule is configured with optimizer="adam", optimizer_lr=0.0005,
# but the original call built Adam with its default learning rate. Pass the
# same value explicitly so this run matches the model's configured optimizer
# settings (presumably what the Lightning and EUGENe runs train with).
optim = torch.optim.Adam(model.parameters(), lr=0.0005)

# NOTE(review): no wandb.log calls are made inside the run; presumably the run
# exists to capture wandb's automatically collected system metrics — confirm.
with wandb.init(project='EUGENe GPU Utilization', name='Native PyTorch', tags=['Ray13']):
    # Outer progress bar over epochs; `train` draws the inner per-batch bar
    for _ in tqdm(range(epochs), position=0):
        train(model, dl, optim, model.loss_fxn)

## Lightning run

In [None]:
# Rebuild the DeepBind architecture so the Lightning run starts from a new
# model rather than the one already trained in the previous run
arch = eu.models.zoo.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols_MT),  # One output per target column (multitask)
    conv_kwargs={
        "input_channels": 4,
        "conv_channels": [1024],
        "conv_kernels": [16],
        "dropout_rates": 0.25,
        # NOTE(review): 0.25 mirrors dropout_rates while dense_kwargs passes a
        # boolean — possibly a copy-paste slip. Value kept as-is; confirm intent.
        "batchnorm": 0.25,
    },
    dense_kwargs={
        "hidden_dims": [512],
        "dropout_rates": 0.25,
        "batchnorm": True,
    },
)

# Run EUGENe's weight initialization on the freshly built architecture
eu.models.init_weights(arch)

# Wrap the architecture in a SequenceModule (regression task, MSE loss,
# Adam with lr=0.0005), then move the module to the GPU
model = eu.models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer="adam",
    optimizer_lr=0.0005,
)
model = model.cuda()

In [None]:
# Lightning trainer: 10 epochs, with Lightning's own logger disabled.
# NOTE(review): no accelerator/devices are passed, so device selection is left
# to the installed Lightning version's defaults — confirm it trains on the GPU,
# otherwise the GPU-utilization comparison is not meaningful.
trainer = pl.Trainer(max_epochs=10, logger=False)

In [None]:
# Lightning run: fit on the training dataloader only (no validation
# dataloader is passed, hence the run name 'PL no val')
with wandb.init(project='EUGENe GPU Utilization', name='PL no val', tags=['Ray13']):
    trainer.fit(model, dl)

## EUGENe run

In [None]:
# Rebuild the DeepBind architecture so the EUGENe run starts from a new
# model rather than the one already trained in the previous run
arch = eu.models.zoo.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols_MT),  # One output per target column (multitask)
    conv_kwargs={
        "input_channels": 4,
        "conv_channels": [1024],
        "conv_kernels": [16],
        "dropout_rates": 0.25,
        # NOTE(review): 0.25 mirrors dropout_rates while dense_kwargs passes a
        # boolean — possibly a copy-paste slip. Value kept as-is; confirm intent.
        "batchnorm": 0.25,
    },
    dense_kwargs={
        "hidden_dims": [512],
        "dropout_rates": 0.25,
        "batchnorm": True,
    },
)

# Run EUGENe's weight initialization on the freshly built architecture
eu.models.init_weights(arch)

# Wrap the architecture in a SequenceModule (regression task, MSE loss,
# Adam with lr=0.0005), then move the module to the GPU
model = eu.models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer="adam",
    optimizer_lr=0.0005,
)
model = model.cuda()

In [None]:
# EUGENe run: train through EUGENe's fit helper inside its own wandb run.
# NOTE(review): unlike the two runs above, the full `sdata` is passed together
# with train_var="train_val", so any train/validation handling happens inside
# `fit_sequence_module` — confirm whether a validation loop runs here, since
# the other runs have none.
fit_options = dict(
    seq_var="ohe_seq",
    target_vars=['target'],
    in_memory=True,
    train_var="train_val",
    epochs=10,  # Same epoch count as the other two runs
    batch_size=2**14,  # Same batch size as the hand-built dataloader
    drop_last=False,
    # No early stopping and no checkpoint monitoring metric
    early_stopping_metric=None,
    model_checkpoint_monitor=None,
)
with wandb.init(project='EUGENe GPU Utilization', name='EUGENe', tags=['Ray13']):
    eu.train.fit_sequence_module(model, sdata, **fit_options)

# DONE!

---

# Scratch