Make initializations

In [1]:
import cbh_data_definitions
import pathlib
import os

import optuna
import pytorch_lightning as pl
import mlflow
from ray import tune
import ray
import ray.tune
import ray.tune.search
import ray.tune.search.optuna
from ray.tune.search.optuna import OptunaSearch
from ray.tune.integration.mlflow import mlflow_mixin
from ray.tune.search import ConcurrencyLimiter
from pytorch_lightning.callbacks import (
    RichProgressBar,
)
from ray.tune.integration.pytorch_lightning import TuneReportCallback
import numpy as np
import datetime
import cbh_torch_MLP

In [2]:
# intialize some settings: mlflow, data directory, resources
root_data_directory = pathlib.Path(os.environ["SCRATCH"]) / "cbh_data"

dev_data_path = root_data_directory / "analysis_ready" / "dev_randomized.zarr"
training_data_path = root_data_directory / "analysis_ready" / "train_randomized.zarr"

mlflow_command_line_run = """
    mlflow server --port 5001 --backend-store-uri sqlite:///mlflowSQLserver.db  --default-artifact-root ./mlflow_artifacts/
"""
mlflow_server_address = 'vld425'
mlflow_server_port = 5001
mlflow_server_uri = f'http://{mlflow_server_address}:{mlflow_server_port:d}'
mlflow_artifact_root = pathlib.Path('./mlflow_artifacts/')

hparams_for_mlflow = {}

redefine data

In [3]:
# init data
(
    train_input,
    train_labels,
    _,
) = cbh_data_definitions.load_data_from_zarr(training_data_path)

(
    dev_input, 
    dev_labels, 
    _
) = cbh_data_definitions.load_data_from_zarr(dev_data_path)

# the cloud volume is not needed for the task, so isn't saved on the load
# show a chunk
train_input

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 2
No. arrays        : 2
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, humidity_temp_pressure_x.zarr
 

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 2
No. arrays        : 2
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, humidity_temp_pressure_x.zarr
 



Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,1.82 GiB
Shape,"(111820800, 70, 3)","(2329600, 70, 3)"
Count,2 Graph Layers,48 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 87.48 GiB 1.82 GiB Shape (111820800, 70, 3) (2329600, 70, 3) Count 2 Graph Layers 48 Chunks Type float32 numpy.ndarray",3  70  111820800,

Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,1.82 GiB
Shape,"(111820800, 70, 3)","(2329600, 70, 3)"
Count,2 Graph Layers,48 Chunks
Type,float32,numpy.ndarray


In [4]:
# limit the data by a factor for less data in a tuning trial
factors_of_chunk = [n for n in range(1, train_input.chunksize[0] + 1) if train_input.chunksize[0] % n == 0]
print("Factors of chunk: ", factors_of_chunk)
hparams_for_mlflow['Limited sample number'] =  -1

Factors of chunk:  [1, 2, 4, 5, 7, 8, 10, 13, 14, 16, 20, 25, 26, 28, 32, 35, 40, 50, 52, 56, 64, 65, 70, 80, 91, 100, 104, 112, 128, 130, 140, 160, 175, 182, 200, 208, 224, 256, 260, 280, 320, 325, 350, 364, 400, 416, 448, 455, 512, 520, 560, 640, 650, 700, 728, 800, 832, 896, 910, 1024, 1040, 1120, 1280, 1300, 1400, 1456, 1600, 1664, 1792, 1820, 2080, 2240, 2275, 2560, 2600, 2800, 2912, 3200, 3328, 3584, 3640, 4160, 4480, 4550, 5120, 5200, 5600, 5824, 6400, 6656, 7168, 7280, 8320, 8960, 9100, 10400, 11200, 11648, 12800, 13312, 14560, 16640, 17920, 18200, 20800, 22400, 23296, 25600, 29120, 33280, 35840, 36400, 41600, 44800, 46592, 58240, 66560, 72800, 83200, 89600, 93184, 116480, 145600, 166400, 179200, 232960, 291200, 332800, 465920, 582400, 1164800, 2329600]


setup study

In [5]:
# DEFINE ALL SETTINGS FOR TRAINING, includes hparam space
experiment_name = 'cbh-hparam-tuning'
CPU_COUNT = 8
RAM_GB = 100
hparams_for_mlflow['CPU Count'] = CPU_COUNT
hparams_for_mlflow['Compute Memory'] = RAM_GB
thread_count_for_dask = CPU_COUNT
dataset_method = '1chunk'
randomize_chunkwise_1chunk = False
shuffle_train_data = False
collate_fn = None # alt: cbh_data_definitions.dataloader_collate_with_dask
num_workers_dataloader = 0 # alt: CPU_COUNT +-
global_trail_number = 0
max_time_for_trial = "00:02:00:00"  # dd:hh:mm:ss
hparams_for_mlflow["Training timeout"] = max_time_for_trial

max_node_num_exclusive = 513
max_layers = 11
factors_for_hparam_choice = [factor for factor in factors_of_chunk if (factor<3300 and factor>3)]
mlp_search_space = {
    "epoch": 1,
    "lr": tune.quniform(0.0001, 0.01, 0.00005),
    "data_limit": 4,#tune.randint(4, int(len(train_labels.chunks[0]) / 4)), # multiple chunk ind by chunklen: train_input.chunksize[0])
    # "activation": tune.choice(["relu", "tanh"]),
    "batch_size": tune.choice(factors_for_hparam_choice),
    "arch_name":"MLP",
    "hidden_layers":tune.randint(1,max_layers),
    "activation":tune.choice(["relu", "tanh"]),
    "input_size":(train_input.shape[2] * train_input.shape[1]),
    "output_size": train_input.shape[1],
    # "layer_node_num": tune.sample_from(lambda spec: 8*np.random.randint(1,int(max_node_num_exclusive/8), size=spec.config.hidden_layers)), # DOES NOT WORK WITH OPTUNA SAMPLER
    "deterministic":False,
    "chkpt_time":datetime.timedelta(minutes=15),
    "max_time":max_time_for_trial
    
}
layer_pattern = 'layer_node_number_{layer_num}_div_8'
for layer_num in range(max_layers):
    mlp_search_space[layer_pattern.format(layer_num=layer_num)] = tune.randint(1,int(max_node_num_exclusive/8))
print(mlp_search_space)

{'epoch': 1, 'lr': <ray.tune.search.sample.Float object at 0x2b4338708d60>, 'data_limit': 4, 'batch_size': <ray.tune.search.sample.Categorical object at 0x2b4338709150>, 'arch_name': 'MLP', 'hidden_layers': <ray.tune.search.sample.Integer object at 0x2b433870b040>, 'activation': <ray.tune.search.sample.Categorical object at 0x2b433870afb0>, 'input_size': 210, 'output_size': 70, 'deterministic': False, 'chkpt_time': datetime.timedelta(seconds=900), 'max_time': '00:02:00:00', 'layer_node_number_0_div_8': <ray.tune.search.sample.Integer object at 0x2b433870ae90>, 'layer_node_number_1_div_8': <ray.tune.search.sample.Integer object at 0x2b433870ae00>, 'layer_node_number_2_div_8': <ray.tune.search.sample.Integer object at 0x2b433870ad70>, 'layer_node_number_3_div_8': <ray.tune.search.sample.Integer object at 0x2b433870a860>, 'layer_node_number_4_div_8': <ray.tune.search.sample.Integer object at 0x2b433870a7d0>, 'layer_node_number_5_div_8': <ray.tune.search.sample.Integer object at 0x2b433870

In [6]:
class MLFlowLogger(pl.loggers.MLFlowLogger): #overwrite mlflogger
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def after_save_checkpoint(self, model_checkpoint: pl.callbacks.ModelCheckpoint) -> None:
        """
        Called after model checkpoint callback saves a new checkpoint.
        """
        best_chkpt = torch.load(model_checkpoint.best_model_path)
        checkpoint_for_mlflow = {
            "val loss": float(best_chkpt['callbacks'][list(key for key in list(best_chkpt['callbacks'].keys()) if "ModelCheckpoint" in key)[0]]['current_score']),
            "train loss at step-1": list(train_loss_metric.value for train_loss_metric in mlf_logger._mlflow_client.get_metric_history(run.info.run_id, "Train loss") if (int(train_loss_metric.step) == int(best_chkpt['global_step']-1)))[0],
            "global_step": best_chkpt['global_step'],
            "model_state_dict": best_chkpt['state_dict'],
            "checkpoint": best_chkpt,
        }
        with TemporaryDirectory() as tmpdirname:
            f_name = os.path.join(tmpdirname, f"{run.info.run_id}-best_model_checkpoint-step_{best_chkpt['global_step']}.pt")
            torch.save(checkpoint_for_mlflow, f_name)
            mlflow.log_artifact(f_name)

mlflow.set_tracking_uri(mlflow_server_uri)
# make vars global
mlf_exp = None
mlf_exp_id = None
try: 
    print('Creating experiment')
    mlf_exp_id = mlflow.create_experiment(experiment_name)
    mlf_exp = mlflow.get_experiment(mlf_exp_id)
except mlflow.exceptions.RestException as e:
    print("Caught")
    if False:
        print(e)
    mlf_exp = mlflow.get_experiment_by_name(experiment_name)
print("Success")

mlp_search_space["mlflow"] = {
    "tracking_uri":mlflow_server_uri,
    "experiment_id":mlf_exp_id,
    "experiment_name":experiment_name,
    # "run_name":(experiment_name+str(datetime.datetime.now())),
}

Creating experiment
Caught
Success


In [7]:
@mlflow_mixin
def objective(ray_config):
    # def model hparams with config
    # print(ray_config)
    # print(dir(ray_config))
    # def data
    print("Test print")
    datamodule = cbh_data_definitions.CBH_DataModule(
        train_input, train_labels,
        dev_input, dev_labels,
        thread_count_for_dask,
        ray_config['batch_size'],
        num_workers = num_workers_dataloader,
        collate_fn = collate_fn,
        shuffle = shuffle_train_data,
        randomize_chunkwise = randomize_chunkwise_1chunk,
        method=dataset_method,
    )
    #def model
    ff_nodes_strings = []
    for key in ray_config:
        if key.startswith("layer_node_number_"):
            ff_nodes_strings.append(key)
    ff_nodes_strings = sorted(ff_nodes_strings)
    ff_nodes = [(8*ray_config[ff_node_num]) for ff_node_num in ff_nodes_strings]
    print(ray_config['hidden_layers'])
    print(ff_nodes)
    model = cbh_torch_MLP.CloudBaseMLP(
        ray_config['input_size'],
        ff_nodes,
        ray_config['output_size'],
        ray_config['hidden_layers'],
        ray_config['activation'],
        ray_config['lr'],
    )
    # def experiment naming
    timestamp_template = '{dt.year:04d}{dt.month:02d}{dt.day:02d}T{dt.hour:02d}{dt.minute:02d}{dt.second:02d}'
    run_name_template = 'cbh_challenge_{network_name}_' + timestamp_template
    global global_trail_number
    current_run_name = run_name_template.format(network_name=model.__class__.__name__,
                                                    dt=datetime.datetime.now()
                                                   )
    print("Finished model init")
    # begin mlflow experiment run
    with mlflow.start_run(experiment_id=mlf_exp.experiment_id, run_name=current_run_name, nested=True) as run:
        print("Started mlflow run")
        mlflow.pytorch.autolog()
        mlf_logger = MLFlowLogger(experiment_name=experiment_name, tracking_uri=mlflow_server_uri, run_id=run.info.run_id)
        print("Finished init logger")
        # define trainer
        time_for_checkpoint = ray_config['chkpt_time']
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            train_time_interval=time_for_checkpoint,
            dirpath=run.info.artifact_uri,
            monitor="val_loss_mean",
            save_on_train_epoch_end=False,
            mode="min"
        )
        callbacks = [checkpoint_callback, TuneReportCallback(on="validation_end")]
        print("Finished define callbacks")
        trainer_hparams = {
            'max_epochs':ray_config['epoch'],
            'deterministic':ray_config['deterministic'],
            'val_check_interval':0.05, # val every percentage of the epoch or an INT for after a number of batches
            'devices':"auto",
            'accelerator':"auto",
            # 'max_time':ray_config['max_time'],
            # 'replace_sampler_ddp':False,
            # 'enable_checkpointing':True,
            # 'strategy':None,
            'callbacks':callbacks,
            'logger':mlf_logger,
        }
        print("Finished init hparams kwargs")

        print("Finished log hparams mlflow")
        print(trainer_hparams)
        trainer = pl.Trainer(
            **trainer_hparams
        )
        print("REACH all init before fit")
        trainer.fit(model=model, datamodule=datamodule)
        path_to_save = '{dt.year:04d}{dt.month:02d}{dt.day:02d}-{dt.hour:02d}{dt.minute:02d}{dt.second:02d}'.format(dt=datetime.datetime.now())
        trainer.save_checkpoint(filepath=run.info.artifact_uri + f'/post_epoch_modelchkpt_{path_to_save}')

In [8]:
searcher = OptunaSearch(metric=["val_loss_mean"], mode=["min"])
algo = ConcurrencyLimiter(searcher, max_concurrent=1)#int(CPU_COUNT*(3/4)))
num_hparam_trials = 1

tuner = tune.Tuner(
    objective,
    tune_config=tune.TuneConfig(
        search_alg=algo,
        num_samples=num_hparam_trials,
    ),
    param_space=mlp_search_space,
)
results = tuner.fit()

2022-11-15 12:59:19,202	INFO worker.py:1518 -- Started a local Ray instance.
  return ot.distributions.DiscreteUniformDistribution(
  return ot.distributions.IntUniformDistribution(
[32m[I 2022-11-15 12:59:26,593][0m A new study created in memory with name: optuna[0m


Trial name,status,loc,activation,batch_size,hidden_layers,layer_node_number...,layer_node_number....1,layer_node_number....2,layer_node_number....3,layer_node_number....4,layer_node_number....5,layer_node_number....6,layer_node_number....7,layer_node_number....8,layer_node_number....9,layer_node_number....10,lr
objective_55b22346,RUNNING,10.154.1.24:42538,relu,832,1,14,46,35,60,10,52,49,34,50,47,43,0.00135




[2m[36m(objective pid=42538)[0m Test print
[2m[36m(objective pid=42538)[0m 1
[2m[36m(objective pid=42538)[0m [112, 368, 280, 480, 80, 416, 392, 272, 400, 376, 344]
[2m[36m(objective pid=42538)[0m Finished model init
[2m[36m(objective pid=42538)[0m Started mlflow run
[2m[36m(objective pid=42538)[0m Finished init logger
[2m[36m(objective pid=42538)[0m Finished define callbacks
[2m[36m(objective pid=42538)[0m Finished init hparams kwargs
[2m[36m(objective pid=42538)[0m Finished log hparams mlflow
[2m[36m(objective pid=42538)[0m {'max_epochs': 1, 'deterministic': False, 'val_check_interval': 0.05, 'devices': 'auto', 'accelerator': 'auto', 'callbacks': [<pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint object at 0x2b665013fb50>, <ray.tune.integration.pytorch_lightning.TuneReportCallback object at 0x2b665016d570>], 'logger': <__main__.MLFlowLogger object at 0x2b664fb4e230>}
[2m[36m(objective pid=42538)[0m REACH all init before fit


[2m[36m(objective pid=42538)[0m GPU available: False, used: False
[2m[36m(objective pid=42538)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=42538)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=42538)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=42538)[0m   rank_zero_deprecation(
[2m[36m(objective pid=42538)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=42538)[0m   rank_zero_deprecation(
[2m[36m(objective pid=42538)[0m   rank_zero_deprecation(
[2m[36m(objective pid=42538)[0m   rank_zero_deprecation(
[2m[36m(objective pid=42538)[0m   rank_zero_deprecation(
[2m[36m(objective pid=42538)[0m 
[2m[36m(objective pid=42538)[0m   | Name              | Type             | Params
[2m[36m(objective pid=42538)[0m -------------------------------------------------------
[2m[36m(objective pid=42538)[0m 0 | layer_norm      

Sanity Checking: 0it [00:00, ?it/s]


[2m[36m(objective pid=42538)[0m   rank_zero_warn(


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Training: 0it [00:00, ?it/s]                                               
Epoch 0:   0%|          | 0/141800 [00:00<?, ?it/s] 


[2m[36m(objective pid=42538)[0m   rank_zero_warn(


Epoch 0:   0%|          | 2/141800 [00:06<127:47:15,  3.24s/it, loss=4.12, v_num=8422]
Epoch 0:   0%|          | 5/141800 [00:06<52:09:08,  1.32s/it, loss=3.93, v_num=8422] 
Epoch 0:   0%|          | 7/141800 [00:06<37:45:23,  1.04it/s, loss=3.81, v_num=8422]
Epoch 0:   0%|          | 9/141800 [00:06<29:45:55,  1.32it/s, loss=3.7, v_num=8422] 
Epoch 0:   0%|          | 12/141800 [00:06<22:47:41,  1.73it/s, loss=3.57, v_num=8422]
Epoch 0:   0%|          | 14/141800 [00:07<19:48:27,  1.99it/s, loss=3.51, v_num=8422]
Epoch 0:   0%|          | 16/141800 [00:07<17:33:19,  2.24it/s, loss=3.47, v_num=8422]
Epoch 0:   0%|          | 18/141800 [00:07<15:48:19,  2.49it/s, loss=3.43, v_num=8422]
Epoch 0:   0%|          | 21/141800 [00:07<13:48:49,  2.85it/s, loss=3.4, v_num=8422] 
Epoch 0:   0%|          | 21/141800 [00:07<13:48:51,  2.85it/s, loss=3.34, v_num=8422]
Epoch 0:   0%|          | 22/141800 [00:07<13:17:04,  2.96it/s, loss=3.3, v_num=8422] 
Epoch 0:   0%|          | 23/141800 [00:07<12



Epoch 0:   0%|          | 281/141800 [00:23<3:13:52, 12.17it/s, loss=2.99, v_num=8422]
Epoch 0:   0%|          | 283/141800 [00:23<3:13:22, 12.20it/s, loss=2.98, v_num=8422]
Epoch 0:   0%|          | 285/141800 [00:23<3:12:51, 12.23it/s, loss=2.97, v_num=8422]
Epoch 0:   0%|          | 288/141800 [00:23<3:12:01, 12.28it/s, loss=2.98, v_num=8422]
Epoch 0:   0%|          | 290/141800 [00:23<3:11:25, 12.32it/s, loss=2.98, v_num=8422]
Epoch 0:   0%|          | 292/141800 [00:23<3:10:50, 12.36it/s, loss=2.97, v_num=8422]
Epoch 0:   0%|          | 294/141800 [00:23<3:10:17, 12.39it/s, loss=2.97, v_num=8422]
Epoch 0:   0%|          | 295/141800 [00:23<3:10:02, 12.41it/s, loss=2.97, v_num=8422]
Epoch 0:   0%|          | 297/141800 [00:23<3:09:31, 12.44it/s, loss=2.97, v_num=8422]
Epoch 0:   0%|          | 299/141800 [00:23<3:09:01, 12.48it/s, loss=2.97, v_num=8422]
Epoch 0:   0%|          | 300/141800 [00:24<3:08:46, 12.49it/s, loss=2.96, v_num=8422]
Epoch 0:   0%|          | 302/141800 [00:24

KeyboardInterrupt: 

ensure mlflow

run study

eval