Make initializations

In [1]:
import cbh_data_definitions
import pathlib
import os

import optuna
import pytorch_lightning as pl
import mlflow
from ray import tune
import ray
import ray.tune
import ray.tune.search
import ray.tune.search.optuna
from ray.tune.search.optuna import OptunaSearch
from ray.tune.integration.mlflow import mlflow_mixin
from ray.tune.search import ConcurrencyLimiter
from pytorch_lightning.callbacks import (
    RichProgressBar,
)
from ray.tune.integration.pytorch_lightning import TuneReportCallback
import numpy as np
import datetime
import cbh_torch_MLP

In [2]:
# intialize some settings: mlflow, data directory, resources
root_data_directory = pathlib.Path(os.environ["SCRATCH"]) / "cbh_data"

dev_data_path = root_data_directory / "analysis_ready" / "dev_randomized.zarr"
training_data_path = root_data_directory / "analysis_ready" / "train_randomized.zarr"

mlflow_command_line_run = """
    mlflow server --port 5001 --backend-store-uri sqlite:///mlflowSQLserver.db  --default-artifact-root ./mlflow_artifacts/
"""
mlflow_server_address = 'vld425'
mlflow_server_port = 5001
mlflow_server_uri = f'http://{mlflow_server_address}:{mlflow_server_port:d}'
mlflow_artifact_root = pathlib.Path('./mlflow_artifacts/')

hparams_for_mlflow = {}

redefine data

In [3]:
# init data
(
    train_input,
    train_labels,
    _,
) = cbh_data_definitions.load_data_from_zarr(training_data_path)

(
    dev_input, 
    dev_labels, 
    _
) = cbh_data_definitions.load_data_from_zarr(dev_data_path)

# the cloud volume is not needed for the task, so isn't saved on the load
# show a chunk
train_input

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 2
No. arrays        : 2
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, humidity_temp_pressure_x.zarr
 

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 2
No. arrays        : 2
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, humidity_temp_pressure_x.zarr
 



Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,1.82 GiB
Shape,"(111820800, 70, 3)","(2329600, 70, 3)"
Count,2 Graph Layers,48 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 87.48 GiB 1.82 GiB Shape (111820800, 70, 3) (2329600, 70, 3) Count 2 Graph Layers 48 Chunks Type float32 numpy.ndarray",3  70  111820800,

Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,1.82 GiB
Shape,"(111820800, 70, 3)","(2329600, 70, 3)"
Count,2 Graph Layers,48 Chunks
Type,float32,numpy.ndarray


In [4]:
# limit the data by a factor for less data in a tuning trial
factors_of_chunk = [n for n in range(1, train_input.chunksize[0] + 1) if train_input.chunksize[0] % n == 0]
print("Factors of chunk: ", factors_of_chunk)
hparams_for_mlflow['Limited sample number'] =  -1

Factors of chunk:  [1, 2, 4, 5, 7, 8, 10, 13, 14, 16, 20, 25, 26, 28, 32, 35, 40, 50, 52, 56, 64, 65, 70, 80, 91, 100, 104, 112, 128, 130, 140, 160, 175, 182, 200, 208, 224, 256, 260, 280, 320, 325, 350, 364, 400, 416, 448, 455, 512, 520, 560, 640, 650, 700, 728, 800, 832, 896, 910, 1024, 1040, 1120, 1280, 1300, 1400, 1456, 1600, 1664, 1792, 1820, 2080, 2240, 2275, 2560, 2600, 2800, 2912, 3200, 3328, 3584, 3640, 4160, 4480, 4550, 5120, 5200, 5600, 5824, 6400, 6656, 7168, 7280, 8320, 8960, 9100, 10400, 11200, 11648, 12800, 13312, 14560, 16640, 17920, 18200, 20800, 22400, 23296, 25600, 29120, 33280, 35840, 36400, 41600, 44800, 46592, 58240, 66560, 72800, 83200, 89600, 93184, 116480, 145600, 166400, 179200, 232960, 291200, 332800, 465920, 582400, 1164800, 2329600]


setup study

In [5]:
# DEFINE ALL SETTINGS FOR TRAINING, includes hparam space
experiment_name = 'cbh-hparam-tuning'
CPU_COUNT = 12
RAM_GB = 128
hparams_for_mlflow['CPU Count'] = CPU_COUNT
hparams_for_mlflow['Compute Memory'] = RAM_GB
thread_count_for_dask = CPU_COUNT
dataset_method = '1chunk'
randomize_chunkwise_1chunk = False
shuffle_train_data = False
collate_fn = None # alt: cbh_data_definitions.dataloader_collate_with_dask
num_workers_dataloader = 0 # alt: CPU_COUNT +-
global_trail_number = 0
max_time_for_trial = "00:02:00:00"  # dd:hh:mm:ss
hparams_for_mlflow["Training timeout"] = max_time_for_trial

max_node_num_exclusive = 513
max_layers = 12
factors_for_hparam_choice = [factor for factor in factors_of_chunk if (factor<3300 and factor>3)]
mlp_search_space = {
    "epoch": 1,
    "lr": tune.quniform(0.0001, 0.01, 0.00005),
    "data_limit": 4,
    "batch_size": tune.choice(factors_for_hparam_choice),
    "arch_name":"MLP",
    "hidden_layers":tune.randint(1,max_layers),
    "activation":tune.choice(["relu", "tanh"]),
    "input_size":(train_input.shape[2] * train_input.shape[1]),
    "output_size": train_input.shape[1],
    "deterministic":False,
    "chkpt_time":datetime.timedelta(minutes=15),
    "max_time":max_time_for_trial
    
}
layer_pattern = 'layer_node_number_{layer_num}_div_8'
for layer_num in range(max_layers):
    mlp_search_space[layer_pattern.format(layer_num=layer_num)] = tune.randint(1,int(max_node_num_exclusive/8))
print(mlp_search_space)

{'epoch': 1, 'lr': <ray.tune.search.sample.Float object at 0x2b29ab1199c0>, 'data_limit': 4, 'batch_size': <ray.tune.search.sample.Categorical object at 0x2b29ab118490>, 'arch_name': 'MLP', 'hidden_layers': <ray.tune.search.sample.Integer object at 0x2b29ab118340>, 'activation': <ray.tune.search.sample.Categorical object at 0x2b29ab11ac80>, 'input_size': 210, 'output_size': 70, 'deterministic': False, 'chkpt_time': datetime.timedelta(seconds=900), 'max_time': '00:02:00:00', 'layer_node_number_0_div_8': <ray.tune.search.sample.Integer object at 0x2b29ab11aa70>, 'layer_node_number_1_div_8': <ray.tune.search.sample.Integer object at 0x2b29ab11a9e0>, 'layer_node_number_2_div_8': <ray.tune.search.sample.Integer object at 0x2b29ab11a950>, 'layer_node_number_3_div_8': <ray.tune.search.sample.Integer object at 0x2b29ab11a440>, 'layer_node_number_4_div_8': <ray.tune.search.sample.Integer object at 0x2b29ab11a3b0>, 'layer_node_number_5_div_8': <ray.tune.search.sample.Integer object at 0x2b29ab11

In [6]:
class MLFlowLogger(pl.loggers.MLFlowLogger): #overwrite mlflogger
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def after_save_checkpoint(self, model_checkpoint: pl.callbacks.ModelCheckpoint) -> None:
        """
        Called after model checkpoint callback saves a new checkpoint.
        """
        best_chkpt = torch.load(model_checkpoint.best_model_path)
        checkpoint_for_mlflow = {
            "val loss": float(best_chkpt['callbacks'][list(key for key in list(best_chkpt['callbacks'].keys()) if "ModelCheckpoint" in key)[0]]['current_score']),
            "train loss at step-1": list(train_loss_metric.value for train_loss_metric in mlf_logger._mlflow_client.get_metric_history(run.info.run_id, "Train loss") if (int(train_loss_metric.step) == int(best_chkpt['global_step']-1)))[0],
            "global_step": best_chkpt['global_step'],
            "model_state_dict": best_chkpt['state_dict'],
            "checkpoint": best_chkpt,
        }
        with TemporaryDirectory() as tmpdirname:
            f_name = os.path.join(tmpdirname, f"{run.info.run_id}-best_model_checkpoint-step_{best_chkpt['global_step']}.pt")
            torch.save(checkpoint_for_mlflow, f_name)
            mlflow.log_artifact(f_name)

In [7]:
verbose_objective = False
# @mlflow_mixin
def objective(ray_config):
    
    mlflow.set_tracking_uri(mlflow_server_uri)
    # make vars global
    mlf_exp = None
    mlf_exp_id = None
    try: 
        if verbose_objective: print('Creating experiment')
        mlf_exp_id = mlflow.create_experiment(experiment_name)
        mlf_exp = mlflow.get_experiment(mlf_exp_id)
    except mlflow.exceptions.RestException as e:
        if verbose_objective: print("Caught")
        if False:
            print(e)
        mlf_exp = mlflow.get_experiment_by_name(experiment_name)
    if verbose_objective: print("Success")
    
    datamodule = cbh_data_definitions.CBH_DataModule(
        train_input, train_labels,
        dev_input, dev_labels,
        thread_count_for_dask,
        ray_config['batch_size'],
        num_workers = num_workers_dataloader,
        collate_fn = collate_fn,
        shuffle = shuffle_train_data,
        randomize_chunkwise = randomize_chunkwise_1chunk,
        method=dataset_method,
    )
    #def model
    ff_nodes_strings = []
    for key in ray_config:
        if key.startswith("layer_node_number_"):
            ff_nodes_strings.append(key)
    ff_nodes_strings = sorted(ff_nodes_strings)
    ff_nodes = [(8*ray_config[ff_node_num]) for ff_node_num in ff_nodes_strings]
    if verbose_objective: print(ray_config['hidden_layers'])
    if verbose_objective: print(ff_nodes)
    model = cbh_torch_MLP.CloudBaseMLP(
        ray_config['input_size'],
        ff_nodes,
        ray_config['output_size'],
        ray_config['hidden_layers'],
        ray_config['activation'],
        ray_config['lr'],
    )                                       
    if verbose_objective: print("Finished model init")
    timestamp_template = '{dt.year:04d}{dt.month:02d}{dt.day:02d}T{dt.hour:02d}{dt.minute:02d}{dt.second:02d}'
    run_name_template = 'cbh_challenge_{network_name}_' + timestamp_template
    current_run_name = run_name_template.format(network_name=model.__class__.__name__,
                                                    dt=datetime.datetime.now()
                                                   )
    # begin mlflow experiment run
    with mlflow.start_run(experiment_id=mlf_exp.experiment_id, run_name=current_run_name) as run:
        mlflow.pytorch.autolog()
        mlf_logger = MLFlowLogger(experiment_name=experiment_name, tracking_uri=mlflow_server_uri, run_id=run.info.run_id)
        if verbose_objective: print("Finished init logger")
        # define trainer
        time_for_checkpoint = ray_config['chkpt_time']
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            train_time_interval=time_for_checkpoint,
            dirpath=run.info.artifact_uri,
            monitor="val_loss_mean",
            save_on_train_epoch_end=False,
            mode="min"
        )
        callbacks = [checkpoint_callback, RichProgressBar(), TuneReportCallback(
            {"val_loss_mean": "val_loss_mean",},
            on="validation_epoch_end"
            )
        ]
        if verbose_objective: print("Finished define callbacks")
        trainer_hparams = {
            'max_epochs':ray_config['epoch'],
            'deterministic':ray_config['deterministic'],
            'val_check_interval':0.05, # val every percentage of the epoch or an INT for after a number of batches
            'devices':"auto",
            'accelerator':"auto",
            'max_time':ray_config['max_time'],
            'replace_sampler_ddp':False,
            'enable_checkpointing':True,
            'strategy':None,
            'callbacks':callbacks,
            'logger':mlf_logger,
        }
        if verbose_objective: print("Finished init hparams kwargs")
        hparams_for_mlflow['ray_config'] = ray_config
        mlf_logger.log_hyperparams(hparams_for_mlflow)
        if verbose_objective: print("Finished log hparams mlflow")
        if verbose_objective: print(trainer_hparams)
        trainer = pl.Trainer(
            **trainer_hparams
        )
        if verbose_objective: print("REACH all init before fit")
        trainer.fit(model=model, datamodule=datamodule)
        path_to_save = '{dt.year:04d}{dt.month:02d}{dt.day:02d}-{dt.hour:02d}{dt.minute:02d}{dt.second:02d}'.format(dt=datetime.datetime.now())
        trainer.save_checkpoint(filepath=run.info.artifact_uri + f'/post_epoch_modelchkpt_{path_to_save}')

In [8]:
searcher = OptunaSearch(metric=["val_loss_mean"], mode=["min"])
algo = ConcurrencyLimiter(searcher, max_concurrent=int(CPU_COUNT*(3/4)))
num_hparam_trials = 50
# mlp_search_space["mlflow"] = {
#     "tracking_uri":mlflow_server_uri,
#     "experiment_id":mlf_exp_id,
#     "experiment_name":experiment_name,
# }
tuner = tune.Tuner(
    objective,
    tune_config=tune.TuneConfig(
        search_alg=algo,
        num_samples=num_hparam_trials,
    ),
    param_space=mlp_search_space,
)
results = tuner.fit()

2022-11-18 12:10:13,356	INFO worker.py:1528 -- Started a local Ray instance.
  return ot.distributions.DiscreteUniformDistribution(
  return ot.distributions.IntUniformDistribution(
[32m[I 2022-11-18 12:10:22,279][0m A new study created in memory with name: optuna[0m


0,1
Current time:,2022-11-18 12:29:32
Running for:,00:19:10.05
Memory:,48.8/251.8 GiB

Trial name,status,loc,activation,batch_size,hidden_layers,layer_node_number_0_ div_8,layer_node_number_10 _div_8,layer_node_number_11 _div_8,layer_node_number_1_ div_8,layer_node_number_2_ div_8,layer_node_number_3_ div_8,layer_node_number_4_ div_8,layer_node_number_5_ div_8,layer_node_number_6_ div_8,layer_node_number_7_ div_8,layer_node_number_8_ div_8,layer_node_number_9_ div_8,lr
objective_fa002b54,RUNNING,10.154.1.77:36425,tanh,832,8,35,14,57,56,6,62,52,48,56,4,47,53,0.0004
objective_fd8ede3c,RUNNING,10.154.1.77:37020,tanh,130,10,30,6,53,31,11,53,62,51,17,50,51,21,0.0062
objective_0108445e,RUNNING,10.154.1.77:37276,tanh,7,5,63,40,59,48,61,60,17,24,40,55,50,5,0.0084
objective_0461fd7a,RUNNING,10.154.1.77:37422,relu,1600,11,48,55,13,22,7,45,18,10,51,22,21,52,0.00905
objective_07d5754a,RUNNING,10.154.1.77:37510,relu,3200,4,41,56,36,7,38,12,21,49,32,39,35,9,0.0099
objective_0b794186,RUNNING,10.154.1.77:37755,tanh,13,4,8,36,48,19,31,49,8,52,36,43,34,63,0.002
objective_0f41da1c,RUNNING,10.154.1.77:38620,tanh,140,7,20,61,59,34,25,28,49,21,28,32,2,37,0.0002
objective_13fcf4a6,RUNNING,10.154.1.77:39023,tanh,130,3,18,60,27,5,12,40,16,37,8,55,13,45,0.00895
objective_183b3c44,RUNNING,10.154.1.77:39109,tanh,112,6,5,21,1,14,60,49,14,25,35,20,36,9,0.00795


[2m[36m(objective pid=36425)[0m GPU available: False, used: False
[2m[36m(objective pid=36425)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=36425)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=36425)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=36425)[0m   rank_zero_deprecation(
[2m[36m(objective pid=36425)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=36425)[0m   rank_zero_deprecation(
[2m[36m(objective pid=36425)[0m   rank_zero_deprecation(
[2m[36m(objective pid=36425)[0m   rank_zero_deprecation(
[2m[36m(objective pid=36425)[0m   rank_zero_deprecation(


[2m[36m(objective pid=36425)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=36425)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=36425)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=36425)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=36425)[0m │ 1 │ linears           │ ModuleList       │  786 K │
[2m[36m(objective pid=36425)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=36425)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=36425)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=36425)[0m Trainable params: 786 K                                                         
[2m[36m(objective pid=36425)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=36425)[0m Total params: 786 K  

[2m[36m(objective pid=36425)[0m   rank_zero_warn(
[2m[36m(objective pid=36425)[0m   rank_zero_warn(
[2m[36m(objective pid=37020)[0m GPU available: False, used: False
[2m[36m(objective pid=37020)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=37020)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=37020)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=37020)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37020)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=37020)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37020)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37020)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37020)[0m   rank_zero_deprecation(


[2m[36m(objective pid=37020)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=37020)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=37020)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=37020)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=37020)[0m │ 1 │ linears           │ ModuleList       │  800 K │
[2m[36m(objective pid=37020)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=37020)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=37020)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=37020)[0m Trainable params: 800 K                                                         
[2m[36m(objective pid=37020)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=37020)[0m Total params: 800 K  

[2m[36m(objective pid=37020)[0m   rank_zero_warn(
[2m[36m(objective pid=37020)[0m   rank_zero_warn(
[2m[36m(objective pid=37276)[0m GPU available: False, used: False
[2m[36m(objective pid=37276)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=37276)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=37276)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=37276)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37276)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=37276)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37276)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37276)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37276)[0m   rank_zero_deprecation(


[2m[36m(objective pid=37276)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=37276)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=37276)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=37276)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=37276)[0m │ 1 │ linears           │ ModuleList       │  823 K │
[2m[36m(objective pid=37276)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=37276)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=37276)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=37276)[0m Trainable params: 823 K                                                         
[2m[36m(objective pid=37276)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=37276)[0m Total params: 823 K  

[2m[36m(objective pid=37276)[0m   rank_zero_warn(
[2m[36m(objective pid=37276)[0m   rank_zero_warn(
[2m[36m(objective pid=37422)[0m GPU available: False, used: False
[2m[36m(objective pid=37422)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=37422)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=37422)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=37422)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37422)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=37422)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37422)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37422)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37422)[0m   rank_zero_deprecation(


[2m[36m(objective pid=37422)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=37422)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=37422)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=37422)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=37422)[0m │ 1 │ linears           │ ModuleList       │  555 K │
[2m[36m(objective pid=37422)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=37422)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=37422)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=37422)[0m Trainable params: 555 K                                                         
[2m[36m(objective pid=37422)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=37422)[0m Total params: 555 K  

[2m[36m(objective pid=37422)[0m   rank_zero_warn(
[2m[36m(objective pid=37422)[0m   rank_zero_warn(
[2m[36m(objective pid=37510)[0m GPU available: False, used: False
[2m[36m(objective pid=37510)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=37510)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=37510)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=37510)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37510)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=37510)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37510)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37510)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37510)[0m   rank_zero_deprecation(


[2m[36m(objective pid=37510)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=37510)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=37510)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=37510)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=37510)[0m │ 1 │ linears           │ ModuleList       │  366 K │
[2m[36m(objective pid=37510)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=37510)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=37510)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=37510)[0m Trainable params: 366 K                                                         
[2m[36m(objective pid=37510)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=37510)[0m Total params: 366 K  

[2m[36m(objective pid=37510)[0m   rank_zero_warn(
[2m[36m(objective pid=37510)[0m   rank_zero_warn(
[2m[36m(objective pid=37755)[0m GPU available: False, used: False
[2m[36m(objective pid=37755)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=37755)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=37755)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=37755)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37755)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=37755)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37755)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37755)[0m   rank_zero_deprecation(
[2m[36m(objective pid=37755)[0m   rank_zero_deprecation(


[2m[36m(objective pid=37755)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=37755)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=37755)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=37755)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=37755)[0m │ 1 │ linears           │ ModuleList       │  212 K │
[2m[36m(objective pid=37755)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=37755)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=37755)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=37755)[0m Trainable params: 212 K                                                         
[2m[36m(objective pid=37755)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=37755)[0m Total params: 212 K  

[2m[36m(objective pid=37755)[0m   rank_zero_warn(
[2m[36m(objective pid=37755)[0m   rank_zero_warn(
[2m[36m(objective pid=38620)[0m GPU available: False, used: False
[2m[36m(objective pid=38620)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=38620)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=38620)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=38620)[0m   rank_zero_deprecation(
[2m[36m(objective pid=38620)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=38620)[0m   rank_zero_deprecation(
[2m[36m(objective pid=38620)[0m   rank_zero_deprecation(
[2m[36m(objective pid=38620)[0m   rank_zero_deprecation(
[2m[36m(objective pid=38620)[0m   rank_zero_deprecation(


[2m[36m(objective pid=38620)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=38620)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=38620)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=38620)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=38620)[0m │ 1 │ linears           │ ModuleList       │  687 K │
[2m[36m(objective pid=38620)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=38620)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=38620)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=38620)[0m Trainable params: 687 K                                                         
[2m[36m(objective pid=38620)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=38620)[0m Total params: 687 K  

[2m[36m(objective pid=38620)[0m   rank_zero_warn(
[2m[36m(objective pid=38620)[0m   rank_zero_warn(
[2m[36m(objective pid=39023)[0m GPU available: False, used: False
[2m[36m(objective pid=39023)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=39023)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=39023)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=39023)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39023)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=39023)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39023)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39023)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39023)[0m   rank_zero_deprecation(


[2m[36m(objective pid=39023)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=39023)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=39023)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=39023)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=39023)[0m │ 1 │ linears           │ ModuleList       │  219 K │
[2m[36m(objective pid=39023)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=39023)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=39023)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=39023)[0m Trainable params: 219 K                                                         
[2m[36m(objective pid=39023)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=39023)[0m Total params: 219 K  

[2m[36m(objective pid=39023)[0m   rank_zero_warn(
[2m[36m(objective pid=39023)[0m   rank_zero_warn(
[2m[36m(objective pid=39109)[0m GPU available: False, used: False
[2m[36m(objective pid=39109)[0m TPU available: False, using: 0 TPU cores
[2m[36m(objective pid=39109)[0m IPU available: False, using: 0 IPUs
[2m[36m(objective pid=39109)[0m HPU available: False, using: 0 HPUs
[2m[36m(objective pid=39109)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39109)[0m   rank_zero_deprecation("The `on_init_end` callback hook was deprecated in v1.6 and will be removed in v1.8.")
[2m[36m(objective pid=39109)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39109)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39109)[0m   rank_zero_deprecation(
[2m[36m(objective pid=39109)[0m   rank_zero_deprecation(


[2m[36m(objective pid=39109)[0m ┏━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┓
[2m[36m(objective pid=39109)[0m ┃   ┃ Name              ┃ Type             ┃ Params ┃
[2m[36m(objective pid=39109)[0m ┡━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━┩
[2m[36m(objective pid=39109)[0m │ 0 │ layer_norm        │ LayerNorm        │    420 │
[2m[36m(objective pid=39109)[0m │ 1 │ linears           │ ModuleList       │  287 K │
[2m[36m(objective pid=39109)[0m │ 2 │ normalize_outputs │ Softmax          │      0 │
[2m[36m(objective pid=39109)[0m │ 3 │ crossentropy_loss │ CrossEntropyLoss │      0 │
[2m[36m(objective pid=39109)[0m └───┴───────────────────┴──────────────────┴────────┘
[2m[36m(objective pid=39109)[0m Trainable params: 288 K                                                         
[2m[36m(objective pid=39109)[0m Non-trainable params: 0                                                         
[2m[36m(objective pid=39109)[0m Total params: 288 K  

[2m[36m(objective pid=39109)[0m   rank_zero_warn(
[2m[36m(objective pid=39109)[0m   rank_zero_warn(
2022-11-18 12:29:32,616	ERROR tune.py:773 -- Trials did not complete: [objective_fa002b54, objective_fd8ede3c, objective_0108445e, objective_0461fd7a, objective_07d5754a, objective_0b794186, objective_0f41da1c, objective_13fcf4a6, objective_183b3c44]
2022-11-18 12:29:32,618	INFO tune.py:777 -- Total run time: 1150.36 seconds (1150.04 seconds for the tuning loop).
*** SIGTERM received at time=1668774815 on cpu 19 ***
PC: @     0x2b28ef59e0e3  (unknown)  epoll_wait
    @     0x2b28ee981630  (unknown)  (unknown)
[2022-11-18 12:33:35,814 E 29131 29131] logging.cc:361: *** SIGTERM received at time=1668774815 on cpu 19 ***
[2022-11-18 12:33:35,814 E 29131 29131] logging.cc:361: PC: @     0x2b28ef59e0e3  (unknown)  epoll_wait
[2022-11-18 12:33:35,814 E 29131 29131] logging.cc:361:     @     0x2b28ee981630  (unknown)  (unknown)


ensure mlflow

run study

eval