# Training a neural network in PyTorch
This notebook trains an LSTM classifier that predicts cloud base height, using PyTorch Lightning for the training loop and MLflow to record the run.

In [1]:
#file handling
import zarr
import os
import dask
import pathlib

#math operators
import numpy as np
import dask.array 

#ml
import torch
import pytorch_lightning as pl
print("pl ver:", pl.__version__)

# training helpers
import mlflow.pytorch
from mlflow.tracking import MlflowClient
# import dask.distributed # sometimes breaks things
# from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler
from pytorch_lightning import Trainer, seed_everything

# defined in directory (model related definitions)
import cbh_torch_lstm 
import cbh_data_definitions

from pytorch_lightning.callbacks import RichProgressBar # this progress bar works through jupyterHub on spice
import datetime

pl ver: 1.6.4


In [4]:
# import importlib
# importlib.reload(cbh_torch_lstm)

In [5]:
# Everything is read from $SCRATCH/cbh_data; the train and dev zarr stores
# sit side by side in the 'analysis_ready' sub-directory.
root_data_directory = pathlib.Path(os.environ['SCRATCH']).joinpath('cbh_data')
analysis_ready_directory = root_data_directory.joinpath('analysis_ready')

training_data_path = analysis_ready_directory.joinpath('train.zarr')
dev_data_path = analysis_ready_directory.joinpath('dev.zarr')

In [6]:
# Load the training store first, then the dev store; each call is unpacked
# into (input, labels, cloud volume) in that order.
load_zarr = cbh_data_definitions.load_data_from_zarr

train_arrays = load_zarr(training_data_path)
train_input, train_labels, train_cloud_volume = train_arrays

dev_arrays = load_zarr(dev_data_path)
dev_input, dev_labels, dev_cloud_volume = dev_arrays

Loaded zarr, file information:
 Name        : /
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : zarr.storage.DirectoryStore
No. members : 3
No. arrays  : 3
No. groups  : 0
Arrays      : cloud_volume_fraction_y.zarr, humidity_temp_pressure_x.zarr,
            : onehot_cloud_base_height_y.zarr
 

Loaded zarr, file information:
 Name        : /
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : zarr.storage.DirectoryStore
No. members : 3
No. arrays  : 3
No. groups  : 0
Arrays      : cloud_volume_fraction_y.zarr, humidity_temp_pressure_x.zarr,
            : onehot_cloud_base_height_y.zarr
 



In [7]:
# Optionally keep only the leading MAX_SAMPLES entries of every array so that
# experiments stay small; set LIMIT_DATA to False to use everything.
LIMIT_DATA = True
MAX_SAMPLES = 1_000_000
if LIMIT_DATA:
    head = slice(None, MAX_SAMPLES)
    train_input, train_labels, train_cloud_volume = (
        train_input[head],
        train_labels[head],
        train_cloud_volume[head],
    )
    dev_input, dev_labels, dev_cloud_volume = (
        dev_input[head],
        dev_labels[head],
        dev_cloud_volume[head],
    )

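## Define the network

The network architecture itself lives in the local `cbh_torch_lstm` module; the next cell only fixes the random seed so that repeated runs are comparable.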

In [8]:
# Seed through Lightning's helper so repeated runs are comparable.
RANDOM_SEED = 42
seed_everything(RANDOM_SEED)

Global seed set to 42


42

## Perform the network initialization and training

In [9]:
# --- dimensions read off the training data ---
height_dim = train_input.shape[1]
input_size = train_input.shape[2]  # feature dimension seen by each cell

# --- model hyperparameters ---
layers = 1
hidden_size = 8
embed_size = 1
output_size = 1  # one cloud-base probability value per height layer
BILSTM = False
batch_first = True

learn_rate = 0.003

log_with_pl = True

model = cbh_torch_lstm.CloudBaseLSTM(
    input_size,
    layers,
    hidden_size,
    output_size,
    height_dim,
    embed_size,
    BILSTM,
    batch_first,
    lr=learn_rate,
)

# --- training budget ---
epochs = 1
max_time = "00:02:20:00"  # dd:hh:mm:ss

# --- dataloader settings ---
# Workers are pinned to 0 so that several packages do not each try to
# parallelise the loading without communicating with one another.
workers_on_system = 0
batch_size = 2500
collate_fn = cbh_data_definitions.dataloader_collate_with_dask

In [10]:
train_loader = val_loader = None
INTO_MEMORY = True

# Choose the loader factory once; both factories are called with the same
# arguments, so only the shuffle flag differs between train and validation.
if INTO_MEMORY:
    make_loader = cbh_data_definitions.define_data_get_loader_into_memory
else:
    make_loader = cbh_data_definitions.define_data_get_loader

shared_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=workers_on_system,
    collate_fn=collate_fn,
)

# Training batches are shuffled, validation batches keep their stored order.
train_loader = make_loader(
    train_input, train_cloud_volume, train_labels,
    shuffle=True,
    **shared_loader_kwargs,
)
val_loader = make_loader(
    dev_input, dev_cloud_volume, dev_labels,
    shuffle=False,
    **shared_loader_kwargs,
)

init cbh label, size: (100,)
init cbh label, size: (100,)


In [14]:
# Build the Lightning trainer.
# A checkpoint is written on a 20-minute training-time interval, and the rich
# progress bar is used for display.
time_for_checkpoint = datetime.timedelta(minutes=20)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    train_time_interval=time_for_checkpoint,
)
callbacks = [checkpoint_callback, RichProgressBar()]

trainer = pl.Trainer(
    max_epochs=epochs,
    max_time=max_time,
    check_val_every_n_epoch=1,
    deterministic=True,
    accelerator="auto",
    devices="auto",
    enable_checkpointing=True,
    callbacks=callbacks,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# setup mlflow logging
# Enable MLflow's PyTorch autologging before training starts, so the run
# opened in the training cell is recorded without explicit logging calls.
# NOTE(review): the saved output of this notebook shows `metrics: {}` and
# `artifacts: []` for the run - confirm autologging captures what is expected.

mlflow.pytorch.autolog()

In [16]:
# Fit the model inside an MLflow run; the `run` handle stays bound after the
# block so the run can be looked up by id afterwards.
with mlflow.start_run() as run:
    trainer.fit(
        model=model,
        train_dataloaders=train_loader,
        val_dataloaders=val_loader,
    )


  | Name             | Type             | Params
------------------------------------------------------
0 | LSTM             | LSTM             | 232   
1 | height_embedding | Embedding        | 70    
2 | loss_fn_base     | CrossEntropyLoss | 0     
3 | linearCap        | Linear           | 5.0 K 
------------------------------------------------------
5.3 K     Trainable params
0         Non-trainable params
5.3 K     Total params
0.021     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

basetarg shape: torch.Size([100])
basepred shape: torch.Size([100, 70])


Training: 0it [00:00, ?it/s]

basetarg shape: torch.Size([100])
basepred shape: torch.Size([100, 70])


Validation: 0it [00:00, ?it/s]

basetarg shape: torch.Size([100])
basepred shape: torch.Size([100, 70])


## Display and evaluate results

In [37]:
def print_auto_logged_info(r):
    """Print a short summary of an MLflow run.

    Prints, one per line: the run id, the artifact paths listed under
    "model", the logged params, the logged metrics and the tags whose key
    does not start with "mlflow.".

    Args:
        r: run object exposing ``info.run_id``, ``data.tags``,
            ``data.params`` and ``data.metrics``.
    """
    # Keep only tags that are not in MLflow's own "mlflow." namespace.
    user_tags = {}
    for key, value in r.data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    # Query the tracking client before printing anything.
    listed = MlflowClient().list_artifacts(r.info.run_id, "model")
    artifact_paths = [entry.path for entry in listed]

    print(f"run_id: {r.info.run_id}")
    print(f"artifacts: {artifact_paths}")
    print(f"params: {r.data.params}")
    print(f"metrics: {r.data.metrics}")
    print(f"tags: {user_tags}")

In [38]:
# Fetch the recorded run by id and summarise what MLflow stored for it.
recorded_run = mlflow.get_run(run_id=run.info.run_id)
print_auto_logged_info(recorded_run)
mlflow.end_run()

run_id: 83f2be0e141a4657934459310c39c31a
artifacts: []
params: {'epochs': '1', 'optimizer_name': 'Adam', 'lr': '0.003', 'betas': '(0.9, 0.999)', 'eps': '1e-08', 'weight_decay': '0', 'amsgrad': 'False'}
metrics: {}
tags: {'Mode': 'training'}


In [39]:
# `datetime` is imported as a module in this notebook, so `now()` must be
# reached through the `datetime.datetime` class; calling `datetime.now()` on
# the module raises AttributeError and the checkpoint below is never written.
unique_save_str = datetime.datetime.now()
# NOTE(review): the timestamp is not part of the file name, so re-running this
# cell writes to the same checkpoint path - confirm whether that is intended.
trainer.save_checkpoint('final_out_try2.ckpt')