In [4]:
#Load the packages
import torch
import torch.nn as nn
from lightning.pytorch import Trainer #https://lightning.ai/docs/pytorch/stable/common/trainer.html
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from maldi_zsl_edit.data import MALDITOFDataModule
from maldi_zsl_edit.models import ZSLClassifier
import h5py
import numpy as np

In [5]:
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)
from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [6]:
#What to tune
search_space = {
    "batch_size": tune.choice([16, 32, 64]),
    "emb_dim": tune.choice([512,1024,2048]),
    "lr": tune.loguniform(1e-6, 1e-1),
    "dropout": tune.choice([0.2,0.3,0.4,0.5]),
    "mlp_base" : tune.choice([512, 256]),
    "cnn_base" : tune.choice([64, 128]),
    "kernel" : tune.choice([3,5,7]),
    "cnn_hid" : tune.choice([0,64,128]),
}

In [7]:
#Crear el data set y el entrenamiento debe suceder dentro de una funcion que tengo como input el set de hyperparametros
#trainer.fit(model, dm.train_dataloader(), dm.val_dataloader()) 

def train_func(search_space):
    #tune dataset
    dm = MALDITOFDataModule( 
        "../Data/zsl_binned_new.h5t",
        zsl_mode = True, 
        split_index = 0, 
        batch_size = search_space['batch_size'], # important hyperparameter?
        n_workers = 2,
        in_memory = True, 
        )
    dm.setup(None)

    #tune model
    model = ZSLClassifier(
        mlp_kwargs = { #specify the parameters to buld the MLP ()
            'n_inputs' : 6000, #Bins of the spectra
            'emb_dim' : search_space['emb_dim'], #This is the output of the branch
            'layer_dims': [512, 256],
            'layer_or_batchnorm' : "layer",
            'dropout' : 0.2,
        },
        cnn_kwargs= { #specify the parameters to buld the CNN ()
            'vocab_size' : 6, #Number of words, in this case is 5 as (A,T,C,G,-)
            'emb_dim' : search_space['emb_dim'], #This is the output of the branch
            'conv_sizes' : [search_space['cnn_base'], search_space['cnn_base']*2], #[32, 64, 128] Out chanels of the convolutions #On the nlp mode the first is an embeding dimension
            'hidden_sizes' : [search_space['cnn_hid']], #MLP: [512, 256]. If [0] then goes directly from conv to embeding layer
            'blocks_per_stage' : 2, #How many residual blocks are applied before the pooling
            'kernel_size' : search_space['kernel'],
            #Stride?
            #Max average or non?
            'dropout' : 0.2,
            'nlp' : False #Move directly to the branch
        },
        n_classes = 160,
        t_classes = 493,
        lr=search_space['lr'], # important to tune
        weight_decay=0, # this you can keep constant
        lr_decay_factor=1.00, # this you can keep constant
        warmup_steps=250, # this you can keep constant
        #nlp = False #Try
    )

    #set train
    trainer = Trainer(
        devices="auto",
        accelerator="auto",
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
    )
    trainer.fit(model, dm.train_dataloader(), dm.val_dataloader()) 

In [8]:
# Uses hyperband for the schedule
# The maximum training epochs
num_epochs = 5
# Number of sampls from parameter space
num_samples = 10
scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

In [9]:
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

scaling_config = ScalingConfig(
    num_workers=3, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 1}
)

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="ptl/val_accuracy",
        checkpoint_score_order="max",
    ),
)

In [10]:
from ray.train.torch import TorchTrainer

# Define a TorchTrainer without hyper-parameters for Tuner
ray_trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)

2024-07-19 05:55:23,304	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [11]:
def tune_zsl_asha(num_samples=10):
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric="val_acc",
            mode="max",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    return tuner.fit()


results = tune_zsl_asha(num_samples=num_samples)

0,1
Current time:,2024-07-19 05:57:04
Running for:,00:01:37.78
Memory:,4.6/62.8 GiB

Trial name,# failures,error file
TorchTrainer_bbaf2_00000,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00000_0_batch_size=16,cnn_base=64,cnn_hid=64,dropout=0.2000,emb_dim=512,kernel=5,lr=0.0195,mlp_base=512_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00001,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00001_1_batch_size=32,cnn_base=64,cnn_hid=64,dropout=0.4000,emb_dim=2048,kernel=3,lr=0.0384,mlp_base=512_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00002,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00002_2_batch_size=64,cnn_base=64,cnn_hid=64,dropout=0.5000,emb_dim=2048,kernel=5,lr=0.0236,mlp_base=512_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00003,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00003_3_batch_size=32,cnn_base=128,cnn_hid=64,dropout=0.2000,emb_dim=2048,kernel=3,lr=0.0002,mlp_base=256_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00004,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00004_4_batch_size=32,cnn_base=64,cnn_hid=64,dropout=0.4000,emb_dim=1024,kernel=7,lr=0.0409,mlp_base=256_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00005,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00005_5_batch_size=64,cnn_base=128,cnn_hid=64,dropout=0.2000,emb_dim=512,kernel=3,lr=0.0000,mlp_base=256_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00006,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00006_6_batch_size=32,cnn_base=64,cnn_hid=128,dropout=0.5000,emb_dim=512,kernel=7,lr=0.0000,mlp_base=256_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00007,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00007_7_batch_size=32,cnn_base=128,cnn_hid=0,dropout=0.4000,emb_dim=1024,kernel=5,lr=0.0111,mlp_base=256_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00008,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00008_8_batch_size=32,cnn_base=64,cnn_hid=0,dropout=0.5000,emb_dim=512,kernel=7,lr=0.0003,mlp_base=256_2024-07-19_05-55-26/error.txt"
TorchTrainer_bbaf2_00009,1,"/tmp/ray/session_2024-07-19_05-55-23_328915_19913/artifacts/2024-07-19_05-55-26/TorchTrainer_2024-07-19_05-55-23/driver_artifacts/TorchTrainer_bbaf2_00009_9_batch_size=32,cnn_base=64,cnn_hid=128,dropout=0.5000,emb_dim=2048,kernel=3,lr=0.0004,mlp_base=512_2024-07-19_05-55-26/error.txt"

Trial name,status,loc,train_loop_config/ba tch_size,train_loop_config/cn n_base,train_loop_config/cn n_hid,train_loop_config/dr opout,train_loop_config/em b_dim,train_loop_config/ke rnel,train_loop_config/lr,train_loop_config/ml p_base
TorchTrainer_bbaf2_00000,ERROR,157.193.195.188:20964,16,64,64,0.2,512,5,0.0194553,512
TorchTrainer_bbaf2_00001,ERROR,157.193.195.188:21195,32,64,64,0.4,2048,3,0.0383814,512
TorchTrainer_bbaf2_00002,ERROR,157.193.195.188:21397,64,64,64,0.5,2048,5,0.0235703,512
TorchTrainer_bbaf2_00003,ERROR,157.193.195.188:21598,32,128,64,0.2,2048,3,0.000186375,256
TorchTrainer_bbaf2_00004,ERROR,157.193.195.188:21798,32,64,64,0.4,1024,7,0.0409234,256
TorchTrainer_bbaf2_00005,ERROR,157.193.195.188:22000,64,128,64,0.2,512,3,2.1204e-05,256
TorchTrainer_bbaf2_00006,ERROR,157.193.195.188:22202,32,64,128,0.5,512,7,1.4413e-05,256
TorchTrainer_bbaf2_00007,ERROR,157.193.195.188:22403,32,128,0,0.4,1024,5,0.0110654,256
TorchTrainer_bbaf2_00008,ERROR,157.193.195.188:22604,32,64,0,0.5,512,7,0.000325633,256
TorchTrainer_bbaf2_00009,ERROR,157.193.195.188:22806,32,64,128,0.5,2048,3,0.000426403,512


[36m(RayTrainWorker pid=21032)[0m Setting up process group for: env:// [rank=0, world_size=3]
[36m(TorchTrainer pid=20964)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=20964)[0m - (ip=157.193.195.188, pid=21032) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=20964)[0m - (ip=157.193.195.188, pid=21033) world_rank=1, local_rank=1, node_rank=0
[36m(TorchTrainer pid=20964)[0m - (ip=157.193.195.188, pid=21034) world_rank=2, local_rank=2, node_rank=0
[36m(RayTrainWorker pid=21033)[0m     Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
[36m(RayTrainWorker pid=21033)[0m     PyTorch no longer supports this GPU because it is too old.
[36m(RayTrainWorker pid=21033)[0m     The minimum cuda capability supported by this library is 3.7.
[36m(RayTrainWorker pid=21033)[0m     
[36m(RayTrainWorker pid=21032)[0m     
[36m(RayTrainWorker pid=21034)[0m     
2024-07-19 05:55:36,563	ERROR tune_controller.py:1331 -- Trial task failed for tr

In [12]:
results.get_best_result(metric="ptl/val_accuracy", mode="max")

[36m(RayTrainWorker pid=22857)[0m     Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
[36m(RayTrainWorker pid=22857)[0m     PyTorch no longer supports this GPU because it is too old.
[36m(RayTrainWorker pid=22857)[0m     The minimum cuda capability supported by this library is 3.7.
[36m(RayTrainWorker pid=22857)[0m     
[36m(RayTrainWorker pid=22858)[0m     
[36m(RayTrainWorker pid=22859)[0m     


RuntimeError: No best trial found for the given metric: ptl/val_accuracy. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.