Make initializations

In [67]:
import cbh_data_definitions
import pathlib
import os

import optuna
import pytorch_lightning as pl
import mlflow
from ray import tune
import numpy as np

In [11]:
# intialize some settings: mlflow, data directory, resources
root_data_directory = pathlib.Path(os.environ["SCRATCH"]) / "cbh_data"

dev_data_path = root_data_directory / "analysis_ready" / "dev_randomized.zarr"
training_data_path = root_data_directory / "analysis_ready" / "train_randomized.zarr"

mlflow_command_line_run = """
    mlflow server --port 5001 --backend-store-uri sqlite:///mlflowSQLserver.db  --default-artifact-root ./mlflow_artifacts/
"""
mlflow_server_address = 'vld425'
mlflow_server_port = 5001
mlflow_server_uri = f'http://{mlflow_server_address}:{mlflow_server_port:d}'
mlflow_artifact_root = pathlib.Path('./mlflow_artifacts/')

hparams_for_mlflow = {}

CPU_COUNT = 4
RAM_GB = 6
hparams_for_mlflow['CPU Count'] = CPU_COUNT
hparams_for_mlflow['Compute Memory'] = RAM_GB

redefine data

In [19]:
# init data
(
    train_input,
    train_labels,
    _,
) = cbh_data_definitions.load_data_from_zarr(training_data_path)

(
    dev_input, 
    dev_labels, 
    _
) = cbh_data_definitions.load_data_from_zarr(dev_data_path)

# the cloud volume is not needed for the task, so isn't saved on the load
# show a chunk
train_input

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 2
No. arrays        : 2
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, humidity_temp_pressure_x.zarr
 

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 2
No. arrays        : 2
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, humidity_temp_pressure_x.zarr
 



Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,1.82 GiB
Shape,"(111820800, 70, 3)","(2329600, 70, 3)"
Count,2 Graph Layers,48 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 87.48 GiB 1.82 GiB Shape (111820800, 70, 3) (2329600, 70, 3) Count 2 Graph Layers 48 Chunks Type float32 numpy.ndarray",3  70  111820800,

Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,1.82 GiB
Shape,"(111820800, 70, 3)","(2329600, 70, 3)"
Count,2 Graph Layers,48 Chunks
Type,float32,numpy.ndarray


In [28]:
# limit the data by a factor for less data in a tuning trial
factors_of_chunk = [n for n in range(1, train_input.chunksize[0] + 1) if train_input.chunksize[0] % n == 0]
print("Factors of chunk: ", factors_of_chunk)
hparams_for_mlflow['Limited sample number'] =  -1
factors_for_hparam_choice = [factor for factor in factors_of_chunk if (factor<3300 and factor>3)]

Factors of chunk:  [1, 2, 4, 5, 7, 8, 10, 13, 14, 16, 20, 25, 26, 28, 32, 35, 40, 50, 52, 56, 64, 65, 70, 80, 91, 100, 104, 112, 128, 130, 140, 160, 175, 182, 200, 208, 224, 256, 260, 280, 320, 325, 350, 364, 400, 416, 448, 455, 512, 520, 560, 640, 650, 700, 728, 800, 832, 896, 910, 1024, 1040, 1120, 1280, 1300, 1400, 1456, 1600, 1664, 1792, 1820, 2080, 2240, 2275, 2560, 2600, 2800, 2912, 3200, 3328, 3584, 3640, 4160, 4480, 4550, 5120, 5200, 5600, 5824, 6400, 6656, 7168, 7280, 8320, 8960, 9100, 10400, 11200, 11648, 12800, 13312, 14560, 16640, 17920, 18200, 20800, 22400, 23296, 25600, 29120, 33280, 35840, 36400, 41600, 44800, 46592, 58240, 66560, 72800, 83200, 89600, 93184, 116480, 145600, 166400, 179200, 232960, 291200, 332800, 465920, 582400, 1164800, 2329600]


setup study

In [18]:
class MLFlowLogger(pl.loggers.MLFlowLogger): #overwrite mlflogger
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def after_save_checkpoint(self, model_checkpoint: pl.callbacks.ModelCheckpoint) -> None:
        """
        Called after model checkpoint callback saves a new checkpoint.
        """
        best_chkpt = torch.load(model_checkpoint.best_model_path)
        # print(best_chkpt)
        # print(best_chkpt['callbacks'])
        checkpoint_for_mlflow = {
            "val loss": float(best_chkpt['callbacks'][list(key for key in list(best_chkpt['callbacks'].keys()) if "ModelCheckpoint" in key)[0]]['current_score']),
            "train loss at step-1": list(train_loss_metric.value for train_loss_metric in mlf_logger._mlflow_client.get_metric_history(run.info.run_id, "Train loss") if (int(train_loss_metric.step) == int(best_chkpt['global_step']-1)))[0],
            "global_step": best_chkpt['global_step'],
            "model_state_dict": best_chkpt['state_dict'],
            "checkpoint": best_chkpt,
        }
        with TemporaryDirectory() as tmpdirname:
            f_name = os.path.join(tmpdirname, f"{run.info.run_id}-best_model_checkpoint-step_{best_chkpt['global_step']}.pt")
            torch.save(checkpoint_for_mlflow, f_name)
            mlflow.log_artifact(f_name)

experiment_name = 'cbh-hparam-tuning'

mlflow.set_tracking_uri(mlflow_server_uri)
# make vars global
mlf_exp = None
mlf_exp_id = None
try: 
    print('Creating experiment')
    mlf_exp_id = mlflow.create_experiment(experiment_name)
    mlf_exp = mlflow.get_experiment(mlf_exp_id)
except mlflow.exceptions.RestException:
    mlf_exp = mlflow.get_experiment_by_name(experiment_name)
print("Success")

Creating experiment
Success


In [72]:
max_node_num_exclusive = 513
'num_workers':WORKERS_CPU_COUNT,
'pin_memory':False,
'collate_fn':collate_fn,
'thread_count_for_dask':CPU_COUNT
max_time = "00:02:00:00"  # dd:hh:mm:ss
hparams_for_mlflow["Training timeout"] = max_time

mlp_search_space = {
    "epoch": 1,
    "lr": tune.quniform(0.0001, 0.01, 0.00005),
    "data_limit": tune.randint(4, int(len(train_labels.chunks[0]) / 4)), # multiple chunk ind by chunklen: train_input.chunksize[0])
    # "activation": tune.choice(["relu", "tanh"]),
    "batch_size": tune.choice(factors_for_hparam_choice),
    "arch_name":"MLP",
    "layers":tune.randint(1,11),
    "activation":tune.choice(["relu", "tanh"]),
    "input_size":(train_input.shape[2] * train_input.shape[1]),
    "output_size": train_input.shape[1],
    "layer_node_num": tune.sample_from(lambda spec: 8*np.random.randint(1,int(max_node_num_exclusive/8), size=spec.config.layers)),
    
}
                     
                               

In [37]:
def objective(ray_config):
    # def model hparams with config
    print(ray_config)

    # def data (to ensure new data for each trial)
    train_loader = cbh_data_definitions.define_data_get_loader_1chunk(
        train_input,
        train_labels,
        shuffle=shuffle_training_data,
        **data_loader_hparam_dict
    )
    val_loader = cbh_data_definitions.define_data_get_loader_1chunk(
        dev_input,
        dev_labels,
        shuffle=False,
        **data_loader_hparam_dict
    )
    # def trainer

    timestamp_template = '{dt.year:04d}{dt.month:02d}{dt.day:02d}T{dt.hour:02d}{dt.minute:02d}{dt.second:02d}'
    run_name_template = 'cbh_challenge_{network_name}_' + timestamp_template
    current_run_name = run_name_template.format(network_name=model.__class__.__name__,
                                                    dt=datetime.datetime.now()
                                                   )

# with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
    with mlflow.start_run(experiment_id=mlf_exp.experiment_id, run_name=current_run_name) as run:

        mlflow.pytorch.autolog()
        mlf_logger = MLFlowLogger(experiment_name=experiment_name, tracking_uri=mlflow_server_uri, run_id=run.info.run_id)

        # define trainer
        time_for_checkpoint = datetime.timedelta(minutes=15)
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            train_time_interval=time_for_checkpoint,
            dirpath=run.info.artifact_uri,
            monitor="Val loss",
            save_on_train_epoch_end=False,
            mode="min"
        )
        callbacks = [checkpoint_callback, RichProgressBar()]
        
        trainer_hparams = {
            'max_epochs':epochs,
            'deterministic':True,
            'val_check_interval':0.05, # val every percentage of the data
            'devices':"auto",
            'accelerator':"auto",
            'max_time':max_time,
            'replace_sampler_ddp':False,
            'enable_checkpointing':True,
            'strategy':None,
            'callbacks':callbacks,
            'logger':mlf_logger,
        }
        
        hparams_for_mlflow["Trainer hparams"] = trainer_hparams
        mlf_logger.log_hyperparams(hparams_for_mlflow)
        
        trainer = pl.Trainer(
            **trainer_hparams
        )

        trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)
        path_to_save = '{dt.year:04d}{dt.month:02d}{dt.day:02d}-{dt.hour:02d}{dt.minute:02d}{dt.second:02d}'.format(dt=datetime.datetime.now())
        trainer.save_checkpoint(filepath=run.info.artifact_uri + f'/post_epoch_modelchkpt_{path_to_save}')
    return trainer.callback_metrics["val_acc"].item()

In [46]:
import ray
import ray.tune
import ray.tune.search
import ray.tune.search.optuna
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search import ConcurrencyLimiter
searcher = OptunaSearch(metric=["val_loss"], mode=["min"])
algo = ConcurrencyLimiter(searcher, max_concurrent=int(CPU_COUNT*(3/4)))
num_hparam_trials = 50

tuner = tune.Tuner(
    objective,
    tune_config=tune.TuneConfig(
        search_alg=algo,
        num_samples=num_hparam_trials,
    ),
    param_space=search_space
)
results = tuner.fit()

2022-11-10 13:40:32,210	INFO worker.py:1518 -- Started a local Ray instance.
  return ot.distributions.DiscreteUniformDistribution(
  return ot.distributions.IntUniformDistribution(
[32m[I 2022-11-10 13:40:33,975][0m A new study created in memory with name: optuna[0m


Trial name,status,loc,arch_choice/BILSTM,arch_choice/arch_...,arch_choice/batch...,arch_choice/embed...,arch_choice/heigh...,arch_choice/input...,arch_choice/lstm_...,arch_choice/lstm_....1,arch_choice/outpu...,batch_size,data_limit,lr
objective_4066a0a2,RUNNING,10.152.49.117:43885,False,LSTM,True,1,70,3,8,1,1,910,4,0.00485




[2m[36m(objective pid=43885)[0m {'epoch': 1, 'lr': 0.00485, 'data_limit': 4, 'batch_size': 910, 'arch_choice': {'arch_name': 'LSTM', 'input_size': 3, 'lstm_layers': 1, 'lstm_hidden_size': 8, 'output_size': 1, 'height_dimension': 70, 'embed_size': 1, 'BILSTM': False, 'batch_first': True}}


TuneError: Tune run failed. Please use tuner = Tuner.restore("/home/h02/hsouth/ray_results/objective_2022-11-10_13-40-26") to resume.

ensure mlflow

run study

eval