# GPU Utilization Analysis

In [9]:
import os

import torch
import wandb
from tqdm import tqdm
import seqdata as sd

import eugene as eu
import eugene.train
import eugene.models
from eugene import settings
# Project directories used by EUGENe. Each one can be overridden through an
# environment variable so the notebook is not tied to one user's filesystem;
# the defaults are the paths used for the original analysis.
settings.dataset_dir = os.environ.get(
    "EUGENE_DATASET_DIR",
    "/cellar/users/aklie/data/eugene/revision/kopp21",
)
settings.output_dir = os.environ.get(
    "EUGENE_OUTPUT_DIR",
    "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/output/kopp21",
)
settings.logging_dir = os.environ.get(
    "EUGENE_LOGGING_DIR",
    "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/logs/kopp21",
)
settings.config_dir = os.environ.get(
    "EUGENE_CONFIG_DIR",
    "/cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/configs/kopp21",
)

In [2]:
# Authenticate with Weights & Biases before any of the wandb.init() runs below.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mdlaub[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Load a medium-sized example dataset

In [3]:
# Open the kopp21 training set (a Zarr store under settings.dataset_dir).
# The bare `sdata` expression on the last line displays the dataset summary.
sdata = sd.open_zarr(os.path.join(settings.dataset_dir, 'kopp21_train.zarr'))
sdata

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type object numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type int64 numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type int64 numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,904.82 MiB,1.81 MiB
Shape,"(948771, 1, 500)","(29650, 1, 32)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray
"Array Chunk Bytes 904.82 MiB 1.81 MiB Shape (948771, 1, 500) (29650, 1, 32) Dask graph 512 chunks in 2 graph layers Data type uint16 numpy.ndarray",500  1  948771,

Unnamed: 0,Array,Chunk
Bytes,904.82 MiB,1.81 MiB
Shape,"(948771, 1, 500)","(29650, 1, 32)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint16 numpy.ndarray,uint16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.77 GiB,3.56 MiB
Shape,"(948771, 500, 4)","(59299, 63, 1)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 1.77 GiB 3.56 MiB Shape (948771, 500, 4) (59299, 63, 1) Dask graph 512 chunks in 2 graph layers Data type uint8 numpy.ndarray",4  500  948771,

Unnamed: 0,Array,Chunk
Bytes,1.77 GiB,3.56 MiB
Shape,"(948771, 500, 4)","(59299, 63, 1)"
Dask graph,512 chunks in 2 graph layers,512 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,452.41 MiB,1.81 MiB
Shape,"(948771, 500)","(59299, 32)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 452.41 MiB 1.81 MiB Shape (948771, 500) (59299, 32) Dask graph 256 chunks in 2 graph layers Data type |S1 numpy.ndarray",500  948771,

Unnamed: 0,Array,Chunk
Bytes,452.41 MiB,1.81 MiB
Shape,"(948771, 500)","(59299, 32)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.24 MiB 463.27 kiB Shape (948771,) (59299,) Dask graph 16 chunks in 2 graph layers Data type object numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,7.24 MiB,463.27 kiB
Shape,"(948771,)","(59299,)"
Dask graph,16 chunks in 2 graph layers,16 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 0.90 MiB 231.63 kiB Shape (948771,) (237193,) Dask graph 4 chunks in 2 graph layers Data type uint8 numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 0.90 MiB 231.63 kiB Shape (948771,) (237193,) Dask graph 4 chunks in 2 graph layers Data type bool numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 0.90 MiB 231.63 kiB Shape (948771,) (237193,) Dask graph 4 chunks in 2 graph layers Data type bool numpy.ndarray",948771  1,

Unnamed: 0,Array,Chunk
Bytes,0.90 MiB,231.63 kiB
Shape,"(948771,)","(237193,)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray


In [4]:
# Load the one-hot encoded sequences and the targets (shown as Dask-backed
# arrays in the summary above); the trailing `;` suppresses the cell output.
# NOTE(review): .load() is called on a sub-selection of sdata — presumably the
# underlying variables are shared so sdata itself ends up in memory; confirm.
sdata[['ohe_seq', 'target']].load();

In [5]:
def _swap_axes_1_and_2(x):
    """Return ``x`` with its axes 1 and 2 exchanged."""
    return x.swapaxes(1, 2)


# Per-variable transforms handed to the dataloaders; only "ohe_seq" is altered.
transforms = {"ohe_seq": _swap_axes_1_and_2}

In [13]:
# Keep only the sequences flagged as training examples in `train_val`.
train_sdata = sdata.sel(_sequence=(sdata.train_val == True).compute())
# Training loader for the native PyTorch benchmark. Shuffling is enabled so
# this loader matches the shuffled training loader used for the PyTorch
# Lightning run; otherwise the benchmarks would differ in their data loading
# as well as in the training framework.
dloader = sd.get_torch_dataloader(
    train_sdata,
    sample_dims='_sequence',
    variables=['ohe_seq', 'target'],
    transforms=transforms,
    batch_size=2048,
    shuffle=True,
    pin_memory=True,
    drop_last=False,
)

## Build or choose a medium-sized model

In [6]:
def prep_new_model(
    config,
    seed,
):
    """Build a model from a config file and initialize its weights.

    Parameters
    ----------
    config
        Forwarded to ``eu.models.load_config`` as ``config_path``.
    seed
        Forwarded to ``eu.models.load_config`` as ``seed``.

    Returns
    -------
    The object returned by ``eu.models.load_config``, after
    ``eu.models.init_weights`` has been applied to it.
    """
    fresh_model = eu.models.load_config(config_path=config, seed=seed)
    # Weight initialization happens right after instantiation, before any
    # further setup the caller may perform on the returned model.
    eu.models.init_weights(fresh_model)
    return fresh_model

In [7]:
# Build the model described by kopp21_cnn.yaml with a fixed seed and move it to the GPU.
kopp21 = prep_new_model("kopp21_cnn.yaml", seed=0).to('cuda')

[rank: 0] Global seed set to 0


## Use native PyTorch to train the model and record GPU utilization

In [90]:
def train(model, dloader, optim, loss_fn, device=None):
    """Run one training epoch over ``dloader``.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; it is switched to training mode.
    dloader : iterable
        Yields batches indexable by ``'ohe_seq'`` (inputs) and ``'target'``.
    optim : torch.optim.Optimizer
        Optimizer that steps the parameters of ``model``.
    loss_fn : callable
        Called as ``loss_fn(predictions, targets)``; its result must support
        ``.backward()``.
    device : torch.device or str, optional
        Device the batches are moved to. Defaults to the device of the
        model's first parameter, or ``'cuda'`` if the model has none.
    """
    if device is None:
        # Follow the model instead of assuming a GPU, so the same loop also
        # runs when the model lives on the CPU or on a non-default device.
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else 'cuda'

    model.train()
    # Clear gradients left over from earlier work (e.g. an interrupted run)
    # so they cannot leak into the first optimizer step of this epoch.
    optim.zero_grad()
    for batch in tqdm(dloader, leave=False):
        # non_blocking only has an effect for pinned host memory; it is a
        # plain copy otherwise.
        inputs = batch['ohe_seq'].to(device, non_blocking=True)
        targets = batch['target'].to(device, non_blocking=True)
        pred = model(inputs)
        loss = loss_fn(pred.squeeze(), targets)
        loss.backward()
        optim.step()
        optim.zero_grad()

In [92]:
# Adam with default hyperparameters over the parameters of the bare
# architecture (kopp21.arch), which is also what train() is called with below.
optim = torch.optim.Adam(kopp21.arch.parameters())

# Keep a single wandb run open for the whole training loop
# (presumably so its system metrics cover the entire run — confirm).
with wandb.init(project='EUGENe GPU Utilization', name='Native PyTorch'):
    epochs = 25
    for _ in tqdm(range(epochs)):
        # One full pass over the training loader per epoch.
        train(kopp21.arch, dloader, optim, kopp21.loss_fxn)

  0%|          | 0/25 [00:00<?, ?it/s]
  0%|          | 0/428 [00:00<?, ?it/s][A
  0%|          | 2/428 [00:00<00:28, 15.19it/s][A
  1%|          | 5/428 [00:00<00:20, 20.59it/s][A
  2%|▏         | 8/428 [00:00<00:18, 23.07it/s][A
  3%|▎         | 11/428 [00:00<00:17, 24.40it/s][A
  3%|▎         | 14/428 [00:00<00:16, 25.47it/s][A
  4%|▍         | 17/428 [00:00<00:15, 25.85it/s][A
  5%|▍         | 20/428 [00:00<00:15, 25.94it/s][A
  5%|▌         | 23/428 [00:00<00:15, 26.36it/s][A
  6%|▌         | 26/428 [00:01<00:15, 26.01it/s][A
  7%|▋         | 29/428 [00:01<00:15, 26.38it/s][A
  7%|▋         | 32/428 [00:01<00:14, 26.64it/s][A
  8%|▊         | 35/428 [00:01<00:14, 26.87it/s][A
  9%|▉         | 38/428 [00:01<00:14, 27.13it/s][A
 10%|▉         | 41/428 [00:01<00:14, 27.42it/s][A
 10%|█         | 44/428 [00:01<00:13, 27.55it/s][A
 11%|█         | 48/428 [00:01<00:13, 28.78it/s][A
 12%|█▏        | 52/428 [00:01<00:12, 30.76it/s][A
 13%|█▎        | 56/428 [00:02<00:12,

VBox(children=(Label(value='0.003 MB of 0.034 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.083845…

## Use PyTorch Lightning to train the model and record GPU utilization

In [42]:
import xarray as xr
import pytorch_lightning as pl

In [15]:
# Settings for the PyTorch Lightning benchmark.
batch_size = 2048
num_workers = 0
target_keys = ['target']
seq_key = 'ohe_seq'
in_memory = True
train_key = 'train_val'
prefetch_factor = None
drop_last = False

if target_keys is not None:
    # Accept a single key as well as a list of keys.
    if isinstance(target_keys, str):
        target_keys = [target_keys]
    if len(target_keys) == 1:
        sdata["target"] = sdata[target_keys[0]]
    else:
        # Stack several target variables along a new "_targets" dimension.
        sdata["target"] = xr.concat(
            [sdata[target_key] for target_key in target_keys], dim="_targets"
        ).transpose("_sequence", "_targets")
    nan_mask = sdata['target'].isnull()
    if sdata["target"].ndim > 1:
        # A sequence is dropped if any of its targets is missing.
        nan_mask = nan_mask.any('_targets')
    nan_mask = nan_mask.compute()
    n_nan = nan_mask.sum().item()
    print(f"Dropping {n_nan} sequences with NaN targets.")
    # Actually remove the flagged sequences so the message above is true.
    # Skipped when nothing is flagged, which leaves sdata untouched.
    if n_nan > 0:
        sdata = sdata.sel(_sequence=~nan_mask)
if in_memory:
    print(f"Loading {seq_key} and {target_keys} into memory")
    sdata[seq_key].load()
    sdata["target"].load()

Dropping 0 sequences with NaN targets.
Loading ohe_seq and ['target'] into memory


In [17]:
# Split on the flag stored under `train_key`: True goes to the training
# subset, False to the validation subset.
train_sdata = sdata.sel(_sequence=(sdata[train_key] == True).compute())
val_sdata = sdata.sel(_sequence=(sdata[train_key] == False).compute())

In [47]:
# Shuffled training loader built from the settings defined earlier
# (batch_size, num_workers, prefetch_factor, drop_last) and the shared transforms.
train_dataloader = sd.get_torch_dataloader(
    train_sdata,
    sample_dims=["_sequence"],
    variables=['ohe_seq', "target"],
    batch_size=batch_size,
    num_workers=num_workers,
    prefetch_factor=prefetch_factor,
    transforms=transforms,
    shuffle=True,
    drop_last=drop_last,
)

Get train dl


In [44]:
# Validation loader: same settings as the training loader, but unshuffled.
val_dataloader = sd.get_torch_dataloader(
    val_sdata,
    sample_dims=["_sequence"],
    variables=['ohe_seq', "target"],
    batch_size=batch_size,
    num_workers=num_workers,
    prefetch_factor=prefetch_factor,
    transforms=transforms,
    shuffle=False,
    drop_last=drop_last,
)

In [49]:
# Lightning trainer for the GPU benchmark: 25 epochs, matching the native
# PyTorch loop, with Lightning's own logger switched off.
trainer = pl.Trainer(
    max_epochs=25,
    logger=False,
    devices='auto',
    accelerator="gpu",
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [53]:
# Clear the scheduler on the current `kopp21` object.
# NOTE(review): the next cell rebinds `kopp21` to a freshly built model, so
# this assignment only touches the model that exists at this point — confirm
# that is the intent.
kopp21.scheduler = None

In [54]:
# Start from a freshly built model so this run does not continue from
# weights trained earlier in the notebook.
kopp21 = prep_new_model("kopp21_cnn.yaml", seed=0).to('cuda')
# Clear the scheduler on the model that is actually trained here. Setting it
# on the previous `kopp21` object has no effect once the name is rebound
# above. Presumably needed because no validation loader is passed to fit()
# — confirm against the module's scheduler setup.
kopp21.scheduler = None
with wandb.init(project='EUGENe GPU Utilization', name='PL no val'):
    trainer.fit(kopp21, train_dataloaders=train_dataloader)

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type        | Params
---------------------------------------------
0 | arch         | Kopp21CNN   | 743   
1 | train_metric | BinaryAUROC | 0     
2 | val_metric   | BinaryAUROC | 0     
3 | test_metric  | BinaryAUROC | 0     
---------------------------------------------
743       Trainable params
0         Non-trainable params
743       Total params
0.003     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.


## Use EUGENe to train the model and record GPU utilization

EUGENe requires a validation set, which it evaluates on at every epoch in order to conditionally reduce the learning rate.

In [10]:
# Rebuild the model so this run starts from fresh weights, then train it
# through EUGENe's own fit function inside a dedicated wandb run.
kopp21 = prep_new_model("kopp21_cnn.yaml", seed=0).to('cuda')
with wandb.init(project='EUGENe GPU Utilization', name='EUGENe'):
    # Same dataset, batch size, epoch count and transforms as the runs above;
    # early stopping and checkpoint monitoring are passed as None.
    eu.train.fit_sequence_module(
        kopp21,
        sdata,
        gpus=1,
        seq_key="ohe_seq",
        target_keys=["target"],
        in_memory=True,
        train_key="train_val",
        epochs=25,
        batch_size=2048,
        drop_last=False,
        transforms=transforms,
        early_stopping_metric=None,
        model_checkpoint_monitor=None,
    )

Dropping 0 sequences with NaN targets.
Loading ohe_seq and ['target'] into memory
No seed set


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /cellar/users/dlaub/projects/ML4GLand/EUGENe_paper/logs/kopp21/SequenceModule
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type        | Params
---------------------------------------------
0 | arch         | Kopp21CNN   | 743   
1 | train_metric | BinaryAUROC | 0     
2 | val_metric   | BinaryAUROC | 0     
3 | test_metric  | BinaryAUROC | 0     
---------------------------------------------
743       Trainable params
0         Non-trainable params
743       Total params
0.003     Total estimated model params size (MB)
2023-06-16 16:10:20.221211: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  return F.conv1d(input, weight, bias, self.stride,
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…