In [76]:
import os

import torch
import wandb
from tqdm.auto import tqdm
import pandas as pd
import pytorch_lightning as pl
import xarray as xr

import eugene as eu
import eugene.train
import eugene.models
import eugene.models.zoo
from eugene import settings
import seqdata as sd

In [4]:
# Directories EUGENe should use for the ray13 experiments, keyed by the
# `settings` attribute each one is assigned to.
ray13_dirs = {
    "dataset_dir": "/cellar/users/aklie/data/eugene/revision/ray13",
    "output_dir": "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/output/ray13",
    "logging_dir": "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/logs/ray13",
    "config_dir": "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/configs/ray13",
}
for setting_name, directory in ray13_dirs.items():
    setattr(settings, setting_name, directory)

# Authenticate with Weights & Biases; as the cell's last expression, the
# return value of the call is displayed.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdlaub[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [95]:
# Open the set A multitask dataset from its zarr store.
sdata = sd.open_zarr('/cellar/users/aklie/data/eugene/revision/ray13/norm_setA_MT.zarr')

# Every data variable whose name contains 'RNCMPT' is used as a target column.
target_cols_MT = [n for n in sdata.data_vars.keys() if 'RNCMPT' in n]

sdata = (
    sdata
    # Stack the individual target columns into a single `target` variable
    # along a new `_targets` dimension labelled with the column names.
    .assign(
        target=xr.concat(
            sdata[target_cols_MT].data_vars.values(),
            dim=xr.DataArray(target_cols_MT, dims='_targets')
        )
    )
    # The per-column variables are redundant once stacked. `Dataset.drop` is
    # deprecated in xarray; `drop_vars` is the explicit replacement.
    .drop_vars(target_cols_MT)
    .transpose('_sequence', '_targets', '_ohe', 'length')
)
sdata

Unnamed: 0,Array,Chunk
Bytes,864.41 kiB,216.11 kiB
Shape,"(110645,)","(27662,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 864.41 kiB 216.11 kiB Shape (110645,) (27662,) Dask graph 4 chunks in 2 graph layers Data type object numpy.ndarray",110645  1,

Unnamed: 0,Array,Chunk
Bytes,864.41 kiB,216.11 kiB
Shape,"(110645,)","(27662,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,864.41 kiB,216.11 kiB
Shape,"(110645,)","(27662,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 864.41 kiB 216.11 kiB Shape (110645,) (27662,) Dask graph 4 chunks in 2 graph layers Data type object numpy.ndarray",110645  1,

Unnamed: 0,Array,Chunk
Bytes,864.41 kiB,216.11 kiB
Shape,"(110645,)","(27662,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,864.41 kiB,216.11 kiB
Shape,"(110645,)","(27662,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 864.41 kiB 216.11 kiB Shape (110645,) (27662,) Dask graph 4 chunks in 2 graph layers Data type object numpy.ndarray",110645  1,

Unnamed: 0,Array,Chunk
Bytes,864.41 kiB,216.11 kiB
Shape,"(110645,)","(27662,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,34.61 MiB,594.30 kiB
Shape,"(110645, 4, 41)","(27662, 1, 11)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float16 numpy.ndarray,float16 numpy.ndarray
"Array Chunk Bytes 34.61 MiB 594.30 kiB Shape (110645, 4, 41) (27662, 1, 11) Dask graph 64 chunks in 2 graph layers Data type float16 numpy.ndarray",41  4  110645,

Unnamed: 0,Array,Chunk
Bytes,34.61 MiB,594.30 kiB
Shape,"(110645, 4, 41)","(27662, 1, 11)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float16 numpy.ndarray,float16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,108.05 kiB,108.05 kiB
Shape,"(110645,)","(110645,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 108.05 kiB 108.05 kiB Shape (110645,) (110645,) Dask graph 1 chunks in 2 graph layers Data type bool numpy.ndarray",110645  1,

Unnamed: 0,Array,Chunk
Bytes,108.05 kiB,108.05 kiB
Shape,"(110645,)","(110645,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,196.69 MiB,216.11 kiB
Shape,"(110645, 233)","(27662, 1)"
Dask graph,932 chunks in 701 graph layers,932 chunks in 701 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 196.69 MiB 216.11 kiB Shape (110645, 233) (27662, 1) Dask graph 932 chunks in 701 graph layers Data type float64 numpy.ndarray",233  110645,

Unnamed: 0,Array,Chunk
Bytes,196.69 MiB,216.11 kiB
Shape,"(110645, 233)","(27662, 1)"
Dask graph,932 chunks in 701 graph layers,932 chunks in 701 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [68]:
# Define the architecture to be trained
arch = eu.models.zoo.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols_MT),  # Number of multitask outputs
    conv_kwargs=dict(
        input_channels=4,
        conv_channels=[1024],
        conv_kernels=[16],
        dropout_rates=0.25,
        # Was `batchnorm=0.25`, which looks like the dropout rate pasted into a
        # flag; use an explicit boolean, matching `dense_kwargs` below.
        batchnorm=True,
    ),
    dense_kwargs=dict(hidden_dims=[512], dropout_rates=0.25, batchnorm=True),
)

# Initialize the model prior to conv filter initialization
eu.models.init_weights(arch)

# Wrap the model in a SequenceModule and move it to the GPU.
# This `model` is the one trained by the native PyTorch loop.
model = eu.models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer="adam",
    optimizer_lr=0.0005,
).cuda()

In [96]:
# Call `.load()` on the two variables that the training loops consume.
# NOTE(review): presumably this materializes the lazy (dask-backed) arrays in
# memory, and the loaded data is also visible through `sdata` because the
# subset shares its variables with it — confirm against the xarray version in use.
# The trailing semicolon suppresses the cell's output.
sdata[['ohe_seq', 'target']].load();

In [97]:
# Boolean mask over `_sequence` selecting rows flagged in `train_val`;
# computed eagerly so it can be used as an indexer.
train_mask = (sdata.train_val == True).compute()
train_sdata = sdata.sel(_sequence=train_mask)

# Dataloader shared by the native PyTorch and Lightning runs: it samples along
# `_sequence` and yields the `ohe_seq` and `target` variables per batch.
dataloader_kwargs = dict(
    sample_dims='_sequence',
    variables=['ohe_seq', 'target'],
    batch_size=2**14,
)
dl = sd.get_torch_dataloader(train_sdata, **dataloader_kwargs)

### PyTorch

In [86]:
def train(model, dloader, optim, loss_fn):
    """Run one training epoch of `model` over `dloader`.

    Parameters
    ----------
    model : torch.nn.Module
        Model to optimize. Batches are moved to the device of its first
        parameter; a model without parameters falls back to CUDA.
    dloader : iterable
        Yields mapping-like batches with the keys 'ohe_seq' (model input)
        and 'target' (regression targets).
    optim : torch.optim.Optimizer
        Optimizer constructed over `model`'s parameters.
    loss_fn : callable
        Called as `loss_fn(prediction, target)`; must return a tensor that
        supports `.backward()`.
    """
    model.train()

    # Follow the model's device rather than hard-coding `.cuda()`, so inputs
    # always land where the weights live.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    for batch in tqdm(dloader, position=1, leave=False):
        # Clear gradients *before* the backward pass so gradients left over
        # from any earlier use of the model cannot leak into the first update.
        optim.zero_grad()
        pred = model(batch['ohe_seq'].to(device))
        loss = loss_fn(pred.squeeze(), batch['target'].to(device))
        loss.backward()
        optim.step()

In [88]:
epochs = 10

# Use the same learning rate the SequenceModule is configured with
# (`optimizer_lr=0.0005`), so the native PyTorch run trains with the same Adam
# settings as the Lightning and EUGENe runs instead of Adam's default rate.
optim = torch.optim.Adam(model.parameters(), lr=0.0005)

# Run the training loop inside a W&B run named 'Native PyTorch'.
with wandb.init(project='EUGENe GPU Utilization', name='Native PyTorch', tags=['Ray13']):
    for _ in tqdm(range(epochs), position=0):
        train(model, dl, optim, model.loss_fxn)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

VBox(children=(Label(value='0.003 MB of 0.032 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.087680…

### Lightning

In [89]:
# Define the architecture to be trained
arch = eu.models.zoo.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols_MT),  # Number of multitask outputs
    conv_kwargs=dict(
        input_channels=4,
        conv_channels=[1024],
        conv_kernels=[16],
        dropout_rates=0.25,
        # Was `batchnorm=0.25`, which looks like the dropout rate pasted into a
        # flag; use an explicit boolean, matching `dense_kwargs` below.
        batchnorm=True,
    ),
    dense_kwargs=dict(hidden_dims=[512], dropout_rates=0.25, batchnorm=True),
)

# Initialize the model prior to conv filter initialization
eu.models.init_weights(arch)

# Wrap the model in a SequenceModule and move it to the GPU.
# Rebuilding `model` here gives the Lightning run freshly initialized weights.
model = eu.models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer="adam",
    optimizer_lr=0.0005,
).cuda()

In [90]:
# Lightning trainer for the benchmark: ten epochs, with Lightning's own
# logger switched off.
trainer_kwargs = dict(max_epochs=10, logger=False)
trainer = pl.Trainer(**trainer_kwargs)

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [91]:
# Fit with Lightning on the training dataloader only (no validation loader is
# passed), inside a W&B run named 'PL no val'.
pl_run = wandb.init(project='EUGENe GPU Utilization', name='PL no val', tags=['Ray13'])
with pl_run:
    trainer.fit(model, train_dataloaders=dl)

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type     | Params
------------------------------------------
0 | arch         | DeepBind | 1.2 M 
1 | train_metric | R2Score  | 0     
2 | val_metric   | R2Score  | 0     
3 | test_metric  | R2Score  | 0     
------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.953     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

### EUGENe

In [101]:
# Define the architecture to be trained
arch = eu.models.zoo.DeepBind(
    input_len=41,  # Length of padded sequences
    output_dim=len(target_cols_MT),  # Number of multitask outputs
    conv_kwargs=dict(
        input_channels=4,
        conv_channels=[1024],
        conv_kernels=[16],
        dropout_rates=0.25,
        # Was `batchnorm=0.25`, which looks like the dropout rate pasted into a
        # flag; use an explicit boolean, matching `dense_kwargs` below.
        batchnorm=True,
    ),
    dense_kwargs=dict(hidden_dims=[512], dropout_rates=0.25, batchnorm=True),
)

# Initialize the model prior to conv filter initialization
eu.models.init_weights(arch)

# Wrap the model in a SequenceModule and move it to the GPU.
# Rebuilding `model` here gives the EUGENe run freshly initialized weights.
model = eu.models.SequenceModule(
    arch=arch,
    task="regression",
    loss_fxn="mse",
    optimizer="adam",
    optimizer_lr=0.0005,
).cuda()

In [103]:
# Fit the model through EUGENe's high-level training entry point.
# NOTE: unlike the two runs above, this call is not wrapped in a W&B run; the
# wrapper below was left commented out.
# with wandb.init(project='EUGENe GPU Utilization', name='EUGENe', tags=['Ray13']):
fit_kwargs = dict(
    # Which variables of `sdata` to train on
    seq_key="ohe_seq",
    target_keys=['target'],
    train_key="train_val",
    in_memory=True,
    # Same epoch count and batch size as the other two runs
    epochs=10,
    batch_size=2**14,
    drop_last=False,
    # No early-stopping metric and no checkpoint monitor for this run
    early_stopping_metric=None,
    model_checkpoint_monitor=None,
)
eu.train.fit_sequence_module(model, sdata, **fit_kwargs)

Dropping 0 sequences with NaN targets.
Loading ohe_seq and ['target'] into memory
No seed set


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type     | Params
------------------------------------------
0 | arch         | DeepBind | 1.2 M 
1 | train_metric | R2Score  | 0     
2 | val_metric   | R2Score  | 0     
3 | test_metric  | R2Score  | 0     
------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.953     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
