# Exploring transformed data

This notebook explores and modifies data, after it has been transformed into a desired format.

The exploration will help understanding of the data, and allow for any further changes that will be required for good practice in machine learning.

In [1]:
# imports

#file handling
import os
import pathlib
import sys

from pytorch_lightning.loggers import MLFlowLogger

import dask
import dask.array

# math operators
import numpy as np
import pytorch_lightning as pl

# ml
import torch
import zarr

import datetime
from tempfile import TemporaryDirectory

# training helpers
import mlflow.pytorch
from dask.diagnostics import CacheProfiler, Profiler, ResourceProfiler, visualize
from mlflow.tracking import MlflowClient
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import (
    RichProgressBar,
)  # this progress bar works through jupyterHub on spice

# defined in directory (model related definitions)
import cbh_data_definitions
import cbh_torch_lstm
import cbh_torch_MLP

print("pl ver:", pl.__version__)
print("mlflow ver:", mlflow.__version__)
print("torch ver:", torch.__version__)
print("Python ver:", sys.version_info)

pl ver: 1.7.7
mlflow ver: 1.30.0
torch ver: 1.12.1
Python ver: sys.version_info(major=3, minor=10, micro=6, releaselevel='final', serial=0)


In [2]:
RELOAD_PACKAGES = True
if RELOAD_PACKAGES:
    import importlib
    importlib.reload(cbh_torch_lstm)
    importlib.reload(cbh_torch_MLP)
    importlib.reload(cbh_data_definitions)

In [3]:
root_data_directory = pathlib.Path(os.environ["SCRATCH"]) / "cbh_data"

dev_data_path = root_data_directory / "analysis_ready" / "dev.zarr"
training_data_path = root_data_directory / "analysis_ready" / "train.zarr"


In [4]:
(
    train_input,
    train_labels,
    _,
) = cbh_data_definitions.load_data_from_zarr(training_data_path)

(
    dev_input, 
    dev_labels, 
    _
) = cbh_data_definitions.load_data_from_zarr(dev_data_path)

# the cloud volume is not needed for the task, so isn't saved on the load
#show a chunk, used to inform dask cache size
train_input

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 3
No. arrays        : 3
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, cloud_volume_fraction_y.zarr,
                  : humidity_temp_pressure_x.zarr
 

Loaded zarr, file information:
 Name              : /
Type              : zarr.hierarchy.Group
Read-only         : False
Synchronizer type : zarr.sync.ThreadSynchronizer
Store type        : zarr.storage.DirectoryStore
No. members       : 3
No. arrays        : 3
No. groups        : 0
Arrays            : cloud_base_label_y.zarr, cloud_volume_fraction_y.zarr,
                  : humidity_temp_pressure_x.zarr
 



Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,373.24 MiB
Shape,"(111820800, 70, 3)","(465920, 70, 3)"
Count,2 Graph Layers,240 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 87.48 GiB 373.24 MiB Shape (111820800, 70, 3) (465920, 70, 3) Count 2 Graph Layers 240 Chunks Type float32 numpy.ndarray",3  70  111820800,

Unnamed: 0,Array,Chunk
Bytes,87.48 GiB,373.24 MiB
Shape,"(111820800, 70, 3)","(465920, 70, 3)"
Count,2 Graph Layers,240 Chunks
Type,float32,numpy.ndarray


In [5]:
display_data_info = False
if display_data_info:
    import matplotlib.pyplot as plt
    labels, counts = np.unique(train_labels, return_counts=True)
    print('ok')
    labels = labels.compute()
    print('done')
    counts = counts.compute()
    bins_targ = tuple(zip(list(labels), list(counts)))
    print("\nModel predictions binned: (Class label, Count):", bins_targ, "\n")
    print("Percentage of samples with cloud bases:", np.sum(counts[0:-1]) / (np.sum(counts[-1]) + np.sum(counts[0:-1])))
    fig, ax = plt.subplots(1,2,figsize=(10,10))
    len_samples = len(train_labels)
    ax[0].pie(counts, labels=labels)
    ax[1].pie([np.sum(counts[0:-1]), np.sum(counts[-1])], labels=[1,0])
    ax[0].legend(labels, fontsize=6)
    plt.show()

randomize data

In [6]:
np.random.seed(42)
print("Start permutation gen")
p = np.random.permutation(len(train_input))
p_dev = np.random.permutation(len(dev_input))
print("End permutation gen")

Start permutation gen
End permutation gen


In [7]:
from dask.diagnostics import ProgressBar, ResourceProfiler
#progressbar sits at 100% for a while thus time reporting is not correct
with ProgressBar():
    train_input = train_input.compute()
with ProgressBar():
    train_labels = train_labels.compute()

[########################################] | 100% Completed | 58.82 s
[########################################] | 100% Completed | 7.27 sms


In [9]:
train_inputp = train_input[p]
train_labelsp = train_labels[p]

In [10]:
train_input = train_inputp
train_labels = train_labelsp

### calculate good chunksize - 4gb per chunk as random chunks come about for one chunk at a time strategy, 4gb is reasonable to store in majority of cases
87.48GB = 111820800 samples
373.24MB = 465920 samples

approx 1250000 samples per GB, so desire 5000000 sample chunksize

5000000 in not a factor of the len however, so choose closest factor - and it might be worth considering the length of the validation set too

closest factor of train set: 5324800
closest factor of dev: (len) 307200

HOWEVER: the codec for chunk compression handles only a max of 2GB, so must go lower - max size turns out to be 2329600

In [11]:
print("train dev gcd:", np.gcd(len(train_input), len(dev_input)))
print("Factors of train: ", [n for n in range(1, len(train_input) + 1) if len(train_input) % n == 0])
print("Factors of dev: ", [n for n in range(1, len(dev_input) + 1) if len(dev_input) % n == 0])

train dev gcd: 307200
Factors of train:  [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, 20, 21, 24, 25, 26, 28, 30, 32, 35, 39, 40, 42, 48, 50, 52, 56, 60, 64, 65, 70, 75, 78, 80, 84, 91, 96, 100, 104, 105, 112, 120, 128, 130, 140, 150, 156, 160, 168, 175, 182, 192, 195, 200, 208, 210, 224, 240, 256, 260, 273, 280, 300, 312, 320, 325, 336, 350, 364, 384, 390, 400, 416, 420, 448, 455, 480, 512, 520, 525, 546, 560, 600, 624, 640, 650, 672, 700, 728, 768, 780, 800, 832, 840, 896, 910, 960, 975, 1024, 1040, 1050, 1092, 1120, 1200, 1248, 1280, 1300, 1344, 1365, 1400, 1456, 1536, 1560, 1600, 1664, 1680, 1792, 1820, 1920, 1950, 2048, 2080, 2100, 2184, 2240, 2275, 2400, 2496, 2560, 2600, 2688, 2730, 2800, 2912, 3072, 3120, 3200, 3328, 3360, 3584, 3640, 3840, 3900, 4096, 4160, 4200, 4368, 4480, 4550, 4800, 4992, 5120, 5200, 5376, 5460, 5600, 5824, 6144, 6240, 6400, 6656, 6720, 6825, 7168, 7280, 7680, 7800, 8192, 8320, 8400, 8736, 8960, 9100, 9600, 9984, 10240, 10400, 10752, 10920, 11200, 1164

In [12]:
import zarr
root_data_directory = pathlib.Path(os.environ["SCRATCH"]) / "cbh_data"

path_to_save_zarr_train = (
    root_data_directory / "analysis_ready" / "train_randomized.zarr"
)  # output for zarr files

store = zarr.DirectoryStore(path_to_save_zarr_train)
# define objected for arrays to be grouped under

zarr_grouping = zarr.group(store=store, overwrite=True)

# initialize and then write on zarr arrays for all desired arrays to be saved

cloud_base_label_y = zarr_grouping.zeros(
        shape=train_labels.shape,
        dtype=train_labels.dtype,
        name="cloud_base_label_y.zarr",
        chunks=(2329600),
    )
    
print("Start save 1")
cloud_base_label_y[:] = train_labels
print("End save 1")


humidity_temp_pressure_x = zarr_grouping.zeros(
        shape=train_input.shape,
        dtype=train_input.dtype,
        name="humidity_temp_pressure_x.zarr",
        chunks=(2329600, 70, 3),
    )
print("Start save 2")
humidity_temp_pressure_x[:] = train_input
print("End save 2")


Start save 1
End save 1
Start save 2
End save 2


In [13]:
del train_input
del train_inputp
del train_labelsp
del train_labels
dev_input = dev_input[p_dev].compute()
dev_labels = dev_labels[p_dev].compute()

  dev_input = dev_input[p_dev].compute()
  dev_labels = dev_labels[p_dev].compute()


In [14]:
import zarr
root_data_directory = pathlib.Path(os.environ["SCRATCH"]) / "cbh_data"

path_to_save_zarr_train = (
    root_data_directory / "analysis_ready" / "dev_randomized.zarr"
)  # output for zarr files

store = zarr.DirectoryStore(path_to_save_zarr_train)
# define objected for arrays to be grouped under

zarr_grouping = zarr.group(store=store, overwrite=True)

# initialize and then write on zarr arrays for all desired arrays to be saved

cloud_base_label_y = zarr_grouping.zeros(
        shape=dev_labels.shape,
        dtype=dev_labels.dtype,
        name="cloud_base_label_y.zarr",
        chunks=(len(dev_labels))
    )
    
print("Start save 1")
cloud_base_label_y[:] = dev_labels
print("End save 1")


humidity_temp_pressure_x = zarr_grouping.zeros(
        shape=dev_input.shape,
        dtype=dev_input.dtype,
        name="humidity_temp_pressure_x.zarr",
        chunks=(len(dev_input), 70, 3),
    )
print("Start save 2")
humidity_temp_pressure_x[:] = dev_input
print("End save 2")


Start save 1
End save 1
Start save 2
End save 2
