In [4]:
#Load the packages
import torch
import torch.nn as nn
from lightning.pytorch import Trainer #https://lightning.ai/docs/pytorch/stable/common/trainer.html
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from maldi_zsl_edit.data import MALDITOFDataModule
from maldi_zsl_edit.models import ZSLClassifier
import h5py
import numpy as np

In [5]:
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)
from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [None]:
# Candidate values noted for each hyperparameter (plain tuples; these names are
# not referenced by the other cells visible in this notebook).
batch_size = (16, 32, 64)
dim_emb = (512, 788, 1014)
lr = (1e-4, 1e-6)
dropout = (0.2, 0.5)
k_n = (3, 5, 7)
c_n = (0, 64, 128)

In [6]:
# Search space handed to Ray Tune: one entry per tunable hyperparameter.
# Keys are kept in their original order.
search_space = dict(
    batch_size=tune.choice([16, 32, 64]),
    emb_dim=tune.choice([512, 1024, 2048]),
    lr=tune.loguniform(1e-6, 1e-1),  # sampled on a log scale
    dropout=tune.choice([0.2, 0.3, 0.4, 0.5]),
    mlp_base=tune.choice([512, 256]),
    cnn_base=tune.choice([64, 128]),
    kernel=tune.choice([3, 5, 7]),
    cnn_hid=tune.choice([0, 64, 128]),
)

In [7]:
# The data module and the training run must be created inside a function that
# takes the sampled set of hyperparameters as its input.
def train_func(search_space):
    """Train one ZSLClassifier trial using a sampled hyperparameter configuration.

    Parameters
    ----------
    search_space : dict
        One sampled configuration. The keys ``batch_size``, ``emb_dim``, ``lr``,
        ``cnn_base``, ``cnn_hid`` and ``kernel`` are required. ``dropout`` and
        ``mlp_base`` are optional and fall back to the values that used to be
        hard-coded here (0.2 and 512), so older configurations keep working.

    Returns
    -------
    None
        Nothing is returned; the ``RayTrainReportCallback`` attached to the
        trainer is the only reporting hook set up here.
    """
    # Previously these two were sampled by the tuner but never read, so every
    # trial silently used dropout=0.2 and layer_dims=[512, 256].
    dropout = search_space.get('dropout', 0.2)
    mlp_base = search_space.get('mlp_base', 512)

    # tune dataset
    dm = MALDITOFDataModule(
        "../Data/zsl_binned_new.h5t",
        zsl_mode = True,
        split_index = 0,
        batch_size = search_space['batch_size'], # important hyperparameter?
        n_workers = 2,
        in_memory = True,
        )
    dm.setup(None)

    # tune model
    model = ZSLClassifier(
        mlp_kwargs = { # parameters used to build the MLP branch
            'n_inputs' : 6000, # bins of the spectra
            'emb_dim' : search_space['emb_dim'], # output size of the branch
            # Halving second layer reproduces the former [512, 256] when
            # mlp_base is 512 — assumed intent of "mlp_base", TODO confirm.
            'layer_dims': [mlp_base, mlp_base // 2],
            'layer_or_batchnorm' : "layer",
            'dropout' : dropout,
        },
        cnn_kwargs= { # parameters used to build the CNN branch
            # Number of tokens; the original note lists 5 symbols (A, T, C, G, -)
            # while the value is 6 — TODO confirm.
            'vocab_size' : 6,
            'emb_dim' : search_space['emb_dim'], # output size of the branch
            # Output channels of the convolutions (e.g. [32, 64, 128]); in nlp
            # mode the first entry is an embedding dimension.
            'conv_sizes' : [search_space['cnn_base'], search_space['cnn_base']*2],
            # MLP after the convolutions (e.g. [512, 256]); [0] goes directly
            # from the convolutions to the embedding layer.
            'hidden_sizes' : [search_space['cnn_hid']],
            'blocks_per_stage' : 2, # residual blocks applied before the pooling
            'kernel_size' : search_space['kernel'],
            # TODO: stride?
            # TODO: max pooling, average pooling or none?
            'dropout' : dropout,
            'nlp' : False # move directly to the branch
        },
        n_classes = 160,
        t_classes = 493,
        lr=search_space['lr'], # important to tune
        weight_decay=0, # kept constant
        lr_decay_factor=1.00, # kept constant
        warmup_steps=250, # kept constant
        # nlp = False  # to try
    )

    # set up training
    trainer = Trainer(
        devices="auto",
        accelerator="auto",
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
    )
    # Ray's Lightning integration expects the trainer to be passed through
    # prepare_trainer before fitting; it was imported but never called.
    trainer = prepare_trainer(trainer)
    trainer.fit(model, dm.train_dataloader(), dm.val_dataloader())

In [8]:
# Budget for the search, plus the ASHA (HyperBand-style) scheduler built from it.
num_epochs = 5    # maximum training epochs
num_samples = 10  # number of samples drawn from the parameter space
scheduler = ASHAScheduler(
    max_t=num_epochs,
    grace_period=1,
    reduction_factor=2,
)

In [9]:
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Three workers with GPU enabled; each worker requests one CPU and one GPU.
scaling_config = ScalingConfig(
    num_workers=3,
    use_gpu=True,
    resources_per_worker={"CPU": 1, "GPU": 1},
)

# Keep the two best checkpoints. They are ranked by "val_acc", the same metric
# name the Tuner optimises and get_best_result queries; the previous
# "ptl/val_accuracy" did not match either of them.
# NOTE(review): confirm "val_acc" is the name ZSLClassifier actually logs.
run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="val_acc",
        checkpoint_score_order="max",
    ),
)

In [10]:
from ray.train.torch import TorchTrainer

# TorchTrainer created without hyperparameters: the Tuner passes each trial's
# configuration in through "train_loop_config".
ray_trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    run_config=run_config,
    scaling_config=scaling_config,
)

2024-07-19 05:55:23,304	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [14]:
def tune_zsl_asha(num_samples=10):
    """Run the hyperparameter search with an ASHA scheduler.

    Reads the notebook-level ``num_epochs``, ``ray_trainer`` and
    ``search_space``.

    Parameters
    ----------
    num_samples : int, default 10
        Forwarded to ``tune.TuneConfig(num_samples=...)``.

    Returns
    -------
    The object returned by ``tune.Tuner.fit()``.
    """
    asha = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    # The search maximises the "val_acc" metric.
    tuning_setup = tune.TuneConfig(
        metric="val_acc",
        mode="max",
        num_samples=num_samples,
        scheduler=asha,
    )
    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tuning_setup,
    )
    result_grid = tuner.fit()
    return result_grid


# Launch the search with the notebook-level sample count and keep the results
# for the inspection cell that follows.
results = tune_zsl_asha(num_samples=num_samples)

[36m(RayTrainWorker pid=23126)[0m Setting up process group for: env:// [rank=0, world_size=3]
[36m(TorchTrainer pid=23075)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=23075)[0m - (ip=157.193.195.188, pid=23126) world_rank=0, local_rank=0, node_rank=0
[36m(TorchTrainer pid=23075)[0m - (ip=157.193.195.188, pid=23127) world_rank=1, local_rank=1, node_rank=0
[36m(TorchTrainer pid=23075)[0m - (ip=157.193.195.188, pid=23128) world_rank=2, local_rank=2, node_rank=0
[36m(RayTrainWorker pid=23126)[0m     Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
[36m(RayTrainWorker pid=23126)[0m     PyTorch no longer supports this GPU because it is too old.
[36m(RayTrainWorker pid=23126)[0m     The minimum cuda capability supported by this library is 3.7.
[36m(RayTrainWorker pid=23126)[0m     
[36m(RayTrainWorker pid=23127)[0m     
[36m(RayTrainWorker pid=23128)[0m     
2024-07-19 06:16:29,602	ERROR tune_controller.py:1331 -- Trial task failed for tr

In [15]:
# Pick the trial with the highest "val_acc".
# NOTE(review): the saved output of this cell is a RuntimeError stating that no
# trial reported "val_acc"; the run log above also shows a failed trial task and
# an unsupported-GPU warning — resolve those before relying on this cell.
results.get_best_result(metric="val_acc", mode="max")

[36m(RayTrainWorker pid=24941)[0m     Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
[36m(RayTrainWorker pid=24941)[0m     PyTorch no longer supports this GPU because it is too old.
[36m(RayTrainWorker pid=24941)[0m     The minimum cuda capability supported by this library is 3.7.
[36m(RayTrainWorker pid=24941)[0m     
[36m(RayTrainWorker pid=24940)[0m     
[36m(RayTrainWorker pid=24942)[0m     


RuntimeError: No best trial found for the given metric: val_acc. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.