# Training a neural network in PyTorch
This notebook demonstrates training a classifier in PyTorch.

In [2]:
import zarr
import os
import dask
import dask.array 
import torch
import numpy as np
import pathlib

import pytorch_lightning as pl
print(pl.__version__)

1.6.4


NameError: name 'python' is not defined

In [3]:
root_data_directory = pathlib.Path(os.environ['SCRATCH']) / 'cbh_data'

dev_data_path = root_data_directory / 'analysis_ready' / 'dev.zarr' 
training_data_path = root_data_directory / 'analysis_ready' / 'train.zarr'

In [4]:
# load in the data
def load_data_from_zarr(path):
    
    store = zarr.DirectoryStore(training_data_path)
    zarr_group = zarr.group(store=store)
    print('Loaded zarr, file information:\n', zarr_group.info, '\n')
    
    x = dask.array.from_zarr(zarr_group['humidity_temp_pressure_x.zarr'])
    y_lab = dask.array.from_zarr(zarr_group['onehot_cloud_base_height_y.zarr'])
    y_cont = dask.array.from_zarr(zarr_group['cloud_volume_fraction_y.zarr'])
    
    return x, y_lab, y_cont

In [5]:
train_input, train_labels, train_cloud_volume = load_data_from_zarr(training_data_path)
dev_input, dev_labels, dev_cloud_volume = load_data_from_zarr(dev_data_path)

Loaded zarr, file information:
 Name        : /
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : zarr.storage.DirectoryStore
No. members : 3
No. arrays  : 3
No. groups  : 0
Arrays      : cloud_volume_fraction_y.zarr, humidity_temp_pressure_x.zarr,
            : onehot_cloud_base_height_y.zarr
 

Loaded zarr, file information:
 Name        : /
Type        : zarr.hierarchy.Group
Read-only   : False
Store type  : zarr.storage.DirectoryStore
No. members : 3
No. arrays  : 3
No. groups  : 0
Arrays      : cloud_volume_fraction_y.zarr, humidity_temp_pressure_x.zarr,
            : onehot_cloud_base_height_y.zarr
 



## Define the network

In [6]:
# define RNN
class CloudBaseLSTM(pl.LightningModule):
    def __init__(self, inputSize, lstmLayers, lstmHiddenSize, output_size, height_dimension, embed_size, BILSTM=True, batch_first=False, lr=2e-3, log_boolean=False):
        super().__init__()
        
        self.LSTM = torch.nn.LSTM(inputSize+embed_size, lstmHiddenSize, lstmLayers, batch_first=batch_first, bidirectional=BILSTM, proj_size=output_size)
        
        self.linearCap = torch.nn.Linear(height_dimension, height_dimension)
        
        self.batch_first = batch_first
        self.proj_size = output_size
        
        self.relu = torch.nn.ReLU()
        
        self.height_embedding = torch.nn.Embedding(height_dimension, embed_size)
        self.BILSTM = BILSTM
        self.lr = lr
        
        self.loss_fn_vol = torch.nn.MSELoss()
        self.loss_fn_base = torch.nn.CrossEntropyLoss()
        
        self.log = log_boolean
        
    def forward(self, x, height):
        
        #produce height embeds
        height_embeds = self.height_embedding(height)
        height_embeds = torch.flatten(height_embeds, start_dim=2)
        # print(height_embeds.size())
        
        #concat with feature vector
        x_and_height = torch.cat((x, height_embeds), 2)
        
        #send through LSTM
        lstm_out, _ = self.LSTM(x_and_height)
        # combine backward and forward LSTM outputs for each cell
        if(self.BILSTM):
            lstm_out = lstm_out[:,:,:self.proj_size] + lstm_out[:,:,self.proj_size:]
        # combinedLSTMOut = combinedLSTMOut / 2
        
        # # softmax but check for batch first
        # softmax_dim = 0
        # if self.batch_first:
        #     softmax_dim = 1
            
            
        # flatten seq out
        lstm_out = torch.flatten(lstm_out, start_dim=1)
        
        # #normalization
        # out = torch.nn.functional.log_softmax(nn_out, dim=softmax_dim)
        
        # apply ReLU
        relu_out = self.relu(lstm_out)
        
        # apply linear layer for base prediction
        nn_out = self.linearCap(relu_out)
        
        # return both the nn_out and the lstm out for loss calculations
        return nn_out, relu_out
    
    def generic_model_step(self, batch, batch_idx, str_of_step_name):
        
         #### #### #### WARNING MAY CAUSE SOME WEIRD OBJECT ORIENTED RELATED BEHAVIOUR I AM UNAWARE ABOUT AND NOT WORK #### #### ####
            
        base_pred, vol_pred = self(batch['x'], batch['height_vector'])
        loss_1 = self.loss_fn_vol(vol_pred, batch['cloud_volume_target'])
        loss_2 = self.loss_fn_base(base_pred, batch['cloud_base_target'])
        loss = (loss_1*40) + loss_2
        
        #log to tensorboard
        self.log(str_of_step_name, 'loss', loss)
        self.log(str_of_step_name, 'volume loss component', loss_1)
        self.log(str_of_step_name, 'base height loss component', loss_2)
        
        return loss
        
    
    
    def training_step(self, batch, batch_idx):
        
        return self.generic_model_step(batch, batch_idx, 'training')
    
    def validation_step(self, batch, batch_idx):
        
        return self.generic_model_step(batch, batch_idx, 'validation')
    
    def test_step(self, batch, batch_idx):
        
        return self.generic_model_step(batch, batch_idx, 'test')
    
    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), self.lr)
        
        return optim

# define torch dataloader
class CBH_Dataset(torch.utils.data.Dataset):
    def __init__(self, data_x, data_y, cloud_base_label):
        print('begin init')
        
        self.temp_humidity_pressure = data_x
        self.cloudbase_target = data_y
        self.cbh_label = cloud_base_label
        
        self.height_layer_number = data_x.shape[1] # take the shape at index 1 as data_x of format sample, height, feature
        
        assert self.height_layer_number == 70
        
        # legacy cloud base height
        # self.height_encoding = torch.from_numpy(data_height)
        print('end init')
        
    def __len__(self):
        return len(self.temp_humidity_pressure)

    def __getitem__(self, idx):
        
        # since dask is being used, first compute the values on the index given to the get function, convert the array to tensor for pytorch
        
        input_features = torch.from_numpy(self.temp_humidity_pressure[idx].compute())
        output_target = torch.from_numpy(self.cloudbase_target[idx].compute())
        output_target = output_target.type(torch.FloatTensor)
        cbh_lab = torch.from_numpy(self.cbh_label[idx].compute())
        
        print('CALL ON GETITEM')
        
        height_vec = torch.from_numpy(np.arange(self.height_layer_number)) # should have produced this vector here, as it is the same every time, but will leave it since sunken cost and maybe it improves performance??? 
        
        item_in_dataset = {'x':input_features, 'cloud_volume_target':output_target, 'cloud_base_target':cbh_lab, 'height_vector':height_vec}
        return item_in_dataset
    

In [7]:
# define dask specific collate function for dataloader, collate is the step where the dataloader combines all the samples into a singular batch to be enumerated on, 
# after getting all items 

def dataloader_collate_with_dask(batch):
    
    assert torch.utils.data.get_worker_info() is None # if this assertion fails, there are issues in code and this case needs to be handled see pytorch source of default collate fn
    print(batch)
    return 1
    
    elem = batch[0]
    elem_type = type(elem)
    if isinstance(elem, torch.Tensor):
        out = None
        if torch.utils.data.get_worker_info() is not None:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum(x.numel() for x in batch)
            storage = elem.storage()._new_shared(numel, device=elem.device)
            out = elem.new(storage).resize_(len(batch), *list(elem.size()))
        return torch.stack(batch, 0, out=out)
    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
            and elem_type.__name__ != 'string_':
        if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
            # array of string classes and object
            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
                raise TypeError(default_collate_err_msg_format.format(elem.dtype))

            return default_collate([torch.as_tensor(b) for b in batch])
        elif elem.shape == ():  # scalars
            return torch.as_tensor(batch)
    elif isinstance(elem, collections.abc.Mapping):
        try:
            return elem_type({key: default_collate([d[key] for d in batch]) for key in elem})
        except TypeError:
            # The mapping type may not support `__init__(iterable)`.
            return {key: default_collate([d[key] for d in batch]) for key in elem}
    elif isinstance(elem, collections.abc.Sequence):
        # check to make sure that the elements in batch have consistent size
        it = iter(batch)
        elem_size = len(next(it))
        if not all(len(elem) == elem_size for elem in it):
            raise RuntimeError('each element in list of batch should be of equal size')
        transposed = list(zip(*batch))  # It may be accessed twice, so we use a list.

        if isinstance(elem, tuple):
            return [default_collate(samples) for samples in transposed]  # Backwards compatibility.
        else:
            try:
                return elem_type([default_collate(samples) for samples in transposed])
            except TypeError:
                # The sequence type may not support `__init__(iterable)` (e.g., `range`).
                return [default_collate(samples) for samples in transposed]

    raise TypeError(default_collate_err_msg_format.format(elem_type))
    

In [8]:
# enforce reproducibility
from pytorch_lightning import Trainer, seed_everything

seed_everything(42)


Global seed set to 42


42

## Perform the network initialization and training

In [11]:
# load into torcg dataset 

collate_fn = dataloader_collate_with_dask

train_cbh_data = CBH_Dataset(train_input, train_cloud_volume, train_labels)
dev_cbh_data = CBH_Dataset(dev_input, dev_cloud_volume, dev_labels)

height_dim = train_input.shape[1]

# define model and hyperparameters
layers = 3
input_size = train_input.shape[2] # input size is the cell input (feat dim)
output_size = 1 # for each height layer, predict one value for cloud base prob
hidden_size = 32
embed_size = 5
BILSTM = False
batch_first = True

learn_rate = 0.002

log_with_pl = False # do not log, as track with mlFlow

model = CloudBaseLSTM(input_size, layers, hidden_size, output_size, height_dim, embed_size, BILSTM, batch_first, lr=learn_rate, log_boolean=log_with_pl)

# define training related hyperparameters

epochs = 10
max_time ="00:12:00:00" #dd:hh:mm:ss

batch_size = 10

# after training parameters defined, load datasets into dataloaders
train_loader = torch.utils.data.DataLoader(train_cbh_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(dev_cbh_data, batch_size=batch_size, collate_fn=collate_fn) # don't shuffle in val

# define trainer

# 
trainer = pl.Trainer(max_epochs = epochs, deterministic=True, check_val_every_n_epoch=1, devices="auto", accelerator="auto", max_time=max_time)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


begin init
end init
begin init
end init


In [12]:
# setup mlflow logging
import mlflow.pytorch
from mlflow.tracking import MlflowClient

mlflow.pytorch.autolog()

In [14]:
# run the training function 
with mlflow.start_run() as run:
    trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

Missing logger folder: /net/home/h02/hsouth/github_committing/data_science_cop/challenges/2021_CyrilMorcrette_cloudBaseHeight/lightning_logs

  | Name             | Type             | Params
------------------------------------------------------
0 | LSTM             | LSTM             | 2.5 K 
1 | linearCap        | Linear           | 5.0 K 
2 | relu             | ReLU             | 0     
3 | height_embedding | Embedding        | 350   
4 | loss_fn_vol      | MSELoss          | 0     
5 | loss_fn_base     | CrossEntropyLoss | 0     
------------------------------------------------------
7.8 K     Trainable params
0         Non-trainable params
7.8 K     Total params
0.031     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


CALL ON GETITEM
CALL ON GETITEM
CALL ON GETITEM
CALL ON GETITEM
CALL ON GETITEM
CALL ON GETITEM
CALL ON GETITEM


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Display and evaluate results

In [None]:
# display mlflow output
print_auto_logged_info(mlflow.get_run(run_id=run.info.run_id))

In [None]:
# sample some predictions
#have a look at model predictions for a sample after training
# {'x':input_features, 'cloud_volume_target':output_target, 'cloud_base_target':cbh_lab, 'height_vector':height_vec}

sample = cbh_data[1:2]

modelOutBase, moVol = model(sample['x'], sample['height_vector'])
print(modelOutBase)
print(sample['cloud_base_target'])
print('')
print(moVol)
print(sample['cloud_volume_target'])


# predictionInit = torch.zeros(modelOut.size(1))
# predictionInit[torch.argmax(modelOut,dim=1)] = 1
# print(predictionInit)