# Conversion of Data to a Machine Learning Friendly Format
This notebook demonstrates taking a single NetCDF file and converting it into analysis-ready NumPy arrays, which are saved to an `.npz` file and then copied into a Zarr store for later use in neural network training.

Specifically, once multiple NetCDF files of UM model data have been loaded into a single Iris CubeList and that CubeList has been saved to disk, this notebook will:<ul>
<li>Load the single NetCDF file back from disk.</li>
<li>Extract the desired cubes: cloud volume fraction, specific humidity, air pressure, and air temperature.</li>
<li>Combine cubes of the same feature where metadata differences have prevented concatenation.</li>
<li>Convert the cubes to numpy arrays.</li>
<li>Format the arrays into a desirable dimension: (Sample Number, Height Level, Feature).</li>
<li>Generate data for the desired target we want to make a prediction on (cloud base height at a level in a sample).</li>
<li>Normalize data where necessary.</li>
<li>Save the data to disk for later loading to perform ML tasks.</li></ul> 

In [5]:
# define imports
import os
import pathlib
import re
import iris
import xarray
import dask

import numpy as np

In [6]:
# define file paths
# all three paths are built from the SCRATCH environment variable; os.environ[...] raises KeyError if it is unset
paths_to_load = os.environ['SCRATCH'] + "/cbh_data/dev/dev_large.nc" # one large nc file of iris' concatenation of all small nc files
path_to_save_result = os.environ['SCRATCH'] + "/cbh_data/analysis_ready/example_dev.npz" # output for numpy arrays
path_to_save_zarr = os.environ['SCRATCH'] + "/cbh_data/analysis_ready/example_dev.zarr" # output for zarr files

## Loading in the Cloud Base Height Data

In [7]:

# load every cube found in the single NetCDF file into `cubes`
cubes = iris.load(paths_to_load)

# report which path was loaded (a single path string here, despite the message wording)
print('Find files complete, list of paths:', paths_to_load)

Find files complete, list of paths: /scratch/hsouth/cbh_data/dev/dev_large.nc


In [4]:
#show cubes
cubes

M01S05I250 (unknown),model_level_number,latitude,longitude
Shape,70,480,640
Dimension coordinates,,,
model_level_number,x,-,-
latitude,-,x,-
longitude,-,-,x
Auxiliary coordinates,,,
level_height,x,-,-
sigma,x,-,-
Scalar coordinates,,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00,0:00

Cloud Volume Fraction In Atmosphere Layer (1),model_level_number,latitude,longitude
Shape,70,480,640
Dimension coordinates,,,
model_level_number,x,-,-
latitude,-,x,-
longitude,-,-,x
Auxiliary coordinates,,,
level_height,x,-,-
sigma,x,-,-
Scalar coordinates,,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00,0:00

Air Pressure (Pa),model_level_number,latitude,longitude
Shape,70,480,640
Dimension coordinates,,,
model_level_number,x,-,-
latitude,-,x,-
longitude,-,-,x
Auxiliary coordinates,,,
level_height,x,-,-
sigma,x,-,-
Scalar coordinates,,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00,0:00

Air Temperature (K),model_level_number,latitude,longitude
Shape,70,480,640
Dimension coordinates,,,
model_level_number,x,-,-
latitude,-,x,-
longitude,-,-,x
Auxiliary coordinates,,,
level_height,x,-,-
sigma,x,-,-
Scalar coordinates,,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00,0:00

Convective Rainfall Flux (kg m-2 s-1),latitude,longitude
Shape,480,640
Dimension coordinates,,
latitude,x,-
longitude,-,x
Scalar coordinates,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00
time 2017-07-01 12,0:00,0:00
Attributes,,Conventions CF-1.7 STASH m01s05i205 source Data from Met Office Unified Model um_version 10.9

Convective Snowfall Flux (kg m-2 s-1),latitude,longitude
Shape,480,640
Dimension coordinates,,
latitude,x,-
longitude,-,x
Scalar coordinates,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00
time 2017-07-01 12,0:00,0:00
Attributes,,Conventions CF-1.7 STASH m01s05i206 source Data from Met Office Unified Model um_version 10.9

Specific Humidity (kg kg-1),model_level_number,latitude,longitude
Shape,70,480,640
Dimension coordinates,,,
model_level_number,x,-,-
latitude,-,x,-
longitude,-,-,x
Auxiliary coordinates,,,
level_height,x,-,-
sigma,x,-,-
Scalar coordinates,,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00,0:00

Stratiform Rainfall Flux (kg m-2 s-1),latitude,longitude
Shape,480,640
Dimension coordinates,,
latitude,x,-
longitude,-,x
Scalar coordinates,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00
time 2017-07-01 12,0:00,0:00
Attributes,,Conventions CF-1.7 STASH m01s04i203 source Data from Met Office Unified Model um_version 10.9

Stratiform Snowfall Flux (kg m-2 s-1),latitude,longitude
Shape,480,640
Dimension coordinates,,
latitude,x,-
longitude,-,x
Scalar coordinates,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00
time 2017-07-01 12,0:00,0:00
Attributes,,Conventions CF-1.7 STASH m01s04i204 source Data from Met Office Unified Model um_version 10.9

Upward Air Velocity (m s-1),model_level_number,latitude,longitude
Shape,70,480,640
Dimension coordinates,,,
model_level_number,x,-,-
latitude,-,x,-
longitude,-,-,x
Auxiliary coordinates,,,
level_height,x,-,-
sigma,x,-,-
Scalar coordinates,,,forecast_period 12.0 hours
forecast_reference_time 2017-07-01 00,0:00,0:00,0:00


## Preprocess the data

### Extract the desired cubes: cloud volume fraction, specific humidity, air pressure, and air temperature

In [5]:
# extract data we want for the task, temperature, pressure, and humidity for inputs, 
# and cloud volume for outputs
def create_dataset(cubes):
    """Split ``cubes`` into the input cubes and the target cubes for the task.

    Input cubes are those whose ``standard_name`` is one of air_temperature,
    air_pressure or specific_humidity. Target cubes are those whose
    ``long_name`` is cloud_volume_fraction_in_atmosphere_layer.

    Parameters
    ----------
    cubes : iterable of cubes
        Iterated twice (targets first, then inputs), so it must be
        re-iterable, e.g. a list or CubeList.

    Returns
    -------
    tuple
        ``(input_cubes, target_cubes)``, each an ``iris.cube.CubeList`` that
        keeps the order in which the cubes appear in ``cubes``.
    """
    wanted_input_names = ('air_temperature', 'air_pressure', 'specific_humidity')
    wanted_target_names = ('cloud_volume_fraction_in_atmosphere_layer',)

    def _pick(name_attribute, wanted_names):
        # keep the cubes, in their original order, whose name attribute is wanted
        return iris.cube.CubeList(
            [candidate for candidate in cubes
             if getattr(candidate, name_attribute) in wanted_names]
        )

    # targets are matched on long_name, inputs on standard_name
    selected_targets = _pick('long_name', wanted_target_names)
    selected_inputs = _pick('standard_name', wanted_input_names)

    return selected_inputs, selected_targets

# split the loaded cubes into the model inputs and the cloud-volume target
inp_cube, tar_cube = create_dataset(cubes)

# verify success: print both cube lists so the selected names and shapes can be inspected
print("input cube:\n",inp_cube, '\n')
print("target cubes:\n",tar_cube)

input cube:
 0: air_pressure / (Pa)                 (model_level_number: 70; latitude: 480; longitude: 640)
1: air_temperature / (K)               (model_level_number: 70; latitude: 480; longitude: 640)
2: specific_humidity / (kg kg-1)       (model_level_number: 70; latitude: 480; longitude: 640) 

target cubes:
 0: cloud_volume_fraction_in_atmosphere_layer / (1) (model_level_number: 70; latitude: 480; longitude: 640)


### Combine cubes of the same feature where metadata differences have prevented concatenation, while also extracting the numpy array of each cube

In [6]:
#if duplicate cubes exist, concatenate them using numpy to avoid metadata matching issues
def concatenate_same_cubes(cube_list):
    """Collect each cube's array under its name, joining same-named cubes.

    For every cube the array returned by ``cube.core_data()`` is stored under
    the cube's ``long_name``, falling back to ``standard_name`` when
    ``long_name`` is None. When a name has already been seen, the new array is
    concatenated *in front of* the stored one along axis 1, using numpy so
    that cube metadata mismatches cannot block the merge.

    NOTE(review): the original comment describes axis 1 as the forecast
    reference time axis; for 3-D (level, latitude, longitude) cubes axis 1
    would be latitude -- confirm against the data actually being merged.

    Parameters
    ----------
    cube_list : iterable of cubes
        Objects exposing ``core_data()``, ``long_name`` and ``standard_name``.

    Returns
    -------
    dict
        Maps cube name to its (possibly concatenated) array.

    Raises
    ------
    Exception
        If a cube has neither a ``long_name`` nor a ``standard_name``.
    """
    arrays_by_name = {}

    for cube in cube_list:
        payload = cube.core_data()

        # prefer long_name, fall back to standard_name
        name = cube.long_name
        if name is None:
            name = cube.standard_name
        if name is None:
            raise Exception('No name found on cube')

        if name in arrays_by_name:
            # duplicate name: newest array goes first, joined along axis 1
            arrays_by_name[name] = np.concatenate((payload, arrays_by_name[name]), axis=1)
        else:
            arrays_by_name[name] = payload

    return arrays_by_name



# build name -> array dictionaries for the input features and for the target
inp_dict = concatenate_same_cubes(inp_cube)
tar_dict = concatenate_same_cubes(tar_cube)


# verify some concats: the dictionary keys used here must match the cube names found above
print("Air Pressure array shape:", inp_dict['air_pressure'].shape)
print("Cloud Volume array shape:", tar_dict['cloud_volume_fraction_in_atmosphere_layer'].shape)
print("Array types:", type(inp_dict['air_pressure']))

Air Pressure array shape: (70, 480, 640)
Cloud Volume array shape: (70, 480, 640)
Array types: <class 'dask.array.core.Array'>


In [7]:
# combine dictionary elements to one array
def combine_feats(dict_of_feats):
    """Stack all arrays of ``dict_of_feats`` along a new leading feature axis.

    Parameters
    ----------
    dict_of_feats : dict
        Maps feature name to array; all arrays must share one shape.

    Returns
    -------
    array
        Shape ``(number_of_features, *feature_shape)``, with features in the
        dictionary's iteration order.
    """
    # stacking on axis 0 is the same as adding a leading axis to each array
    # and concatenating along it
    return np.stack(list(dict_of_feats.values()), axis=0)

# stack the per-feature arrays so the leading axis indexes the feature (cube)
inp_array = combine_feats(inp_dict)
tar_array = combine_feats(tar_dict)

# verify and check dims
print("Dimensions to standardize for processing:")
print("Current Input Shape:", inp_array.shape)
print("Current Target Shape:", tar_array.shape)

Dimensions to standardize for processing:
Current Input Shape: (3, 70, 480, 640)
Current Target Shape: (1, 70, 480, 640)


In [8]:
# expand the dimensions of "short" arrays to work in flattening:
# two size-1 axes are inserted at positions 1 and 2 so that a 4-dim array becomes the 6-dim layout
# the flattening function expects; the rank check makes the cell safe to re-run
# NOTE(review): only inp_array's rank is tested; tar_array is assumed to have the same rank -- confirm
if(len(inp_array.shape) == 4):
    time_time2_dims_to_add = [1,2]
    inp_array = np.expand_dims(inp_array, time_time2_dims_to_add)
    tar_array = np.expand_dims(tar_array, time_time2_dims_to_add)
    print("New and correct shapes (should be 6 dims):")
    print(inp_array.shape)
    print(tar_array.shape)

New and correct shapes (should be 6 dims):
(3, 1, 1, 70, 480, 640)
(1, 1, 1, 70, 480, 640)


In [9]:
print("Show array storage metadata:")
inp_array

Show array storage metadata:


Unnamed: 0,Array,Chunk
Bytes,246.09 MiB,82.03 MiB
Shape,"(3, 1, 1, 70, 480, 640)","(1, 1, 1, 70, 480, 640)"
Count,15 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 246.09 MiB 82.03 MiB Shape (3, 1, 1, 70, 480, 640) (1, 1, 1, 70, 480, 640) Count 15 Tasks 3 Chunks Type float32 numpy.ndarray",1  1  3  640  480  70,

Unnamed: 0,Array,Chunk
Bytes,246.09 MiB,82.03 MiB
Shape,"(3, 1, 1, 70, 480, 640)","(1, 1, 1, 70, 480, 640)"
Count,15 Tasks,3 Chunks
Type,float32,numpy.ndarray


### Flatten the arrays

In [10]:
# Flatten time and lat/long down to sample number
# defining function as preprocessing is applied to both input and target
# expects 6d array where each expected dimension is named in the function - cube_num, time, time2, height, lat, long
def flatten_cubes_with_numpy(np_array):
    """Flatten a 6-D array to ``(sample, height, cube_num)``.

    The time, time2, lat and long axes are merged into a single sample axis,
    the height axis is preserved, and the cube (feature) axis is moved last.
    When there is exactly one cube the trailing cube axis is dropped, giving
    ``(sample, height)``.

    Parameters
    ----------
    np_array : array
        Array of shape ``(cube_num, time, time2, height, lat, long)``. Only
        ``shape``, ``transpose``, ``T`` and ``squeeze`` are used, plus
        ``np.reshape``.

    Returns
    -------
    array
        Shape ``(time*time2*lat*long, height, cube_num)``, or
        ``(time*time2*lat*long, height)`` when ``cube_num == 1``.

    Raises
    ------
    ValueError
        If ``np_array`` does not have exactly six dimensions (from the shape
        unpacking).
    """
    cube_num, time, time2, height, lat, long = np_array.shape

    # move height next to cube_num so the reshape below only merges the
    # time, time2, lat and long axes and leaves the height axis intact
    height_second = np_array.transpose(0, 3, 1, 2, 4, 5)
    flattened = np.reshape(height_second, (cube_num, height, time * time2 * lat * long))

    # reverse the axis order: (cube_num, height, sample) -> (sample, height, cube_num)
    samples_first = flattened.T

    # Drop only the cube axis, and only when it is a singleton. A bare
    # squeeze() would also remove a sample or height axis of length 1 and
    # silently change the rank of the result.
    if cube_num == 1:
        samples_first = samples_first.squeeze(axis=2)
    return samples_first

# allow the large chunk of data
dask.config.set({"array.slicing.split_large_chunks": False})

# both names are overwritten with their flattened form, so this cell cannot be re-run on its own:
# the flattening function unpacks exactly six dimensions from the array shape
inp_array = flatten_cubes_with_numpy(inp_array)
tar_array = flatten_cubes_with_numpy(tar_array)

# print("verify squeeze")
print("Shapes of flattened and transposed arrays:")
print("Input:", inp_array.shape)
print("Target:", tar_array.shape)


Shapes of flattened and transposed arrays:
Input: (307200, 70, 3)
Target: (307200, 70)


In [11]:
#rechunk large data to ensure large chunks are reduced for easier handling in dask
# NOTE(review): only `import dask` appears in the imports cell; `dask.array` resolving here presumably
# relies on another imported package having loaded that submodule -- confirm or import it explicitly
tar_array = dask.array.rechunk(tar_array, chunks='auto')
print("Rechunked array storage metadata for target:")
tar_array

Rechunked array storage metadata for target:


Unnamed: 0,Array,Chunk
Bytes,82.03 MiB,82.03 MiB
Shape,"(307200, 70)","(307200, 70)"
Count,8 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 82.03 MiB 82.03 MiB Shape (307200, 70) (307200, 70) Count 8 Tasks 1 Chunks Type float32 numpy.ndarray",70  307200,

Unnamed: 0,Array,Chunk
Bytes,82.03 MiB,82.03 MiB
Shape,"(307200, 70)","(307200, 70)"
Count,8 Tasks,1 Chunks
Type,float32,numpy.ndarray


In [12]:
# apply the same 'auto' rechunking to the input features
inp_array = dask.array.rechunk(inp_array, chunks='auto')
print("Rechunked array storage metadata for input:")
inp_array

Rechunked array storage metadata for input:


Unnamed: 0,Array,Chunk
Bytes,246.09 MiB,82.03 MiB
Shape,"(307200, 70, 3)","(307200, 70, 1)"
Count,24 Tasks,3 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 246.09 MiB 82.03 MiB Shape (307200, 70, 3) (307200, 70, 1) Count 24 Tasks 3 Chunks Type float32 numpy.ndarray",3  70  307200,

Unnamed: 0,Array,Chunk
Bytes,246.09 MiB,82.03 MiB
Shape,"(307200, 70, 3)","(307200, 70, 1)"
Count,24 Tasks,3 Chunks
Type,float32,numpy.ndarray


## Preprocess the data toward ML algorithm input

### Generate data for the target of cloud base at certain height

In [13]:
# preprocess the target
# for the target, we define a cloud existing in a height layer:
# if the cloud volume fraction is greater than 2 out of possible 8 oktas
cloud_threshold = 2./8.
# find every occurrence where the cloud volume is greater than the threshold;
# one-argument `where` returns a tuple of index arrays (sample index, height index) for those
# elements only -- nothing is stored for elements at or below the threshold
cloud_over_threshold = dask.array.where(tar_array>cloud_threshold)

In [14]:
%%time
# realize the size of the where condition
# Both index arrays come from the same lazy `where` graph. Computing them in one dask.compute
# call lets dask evaluate that shared graph once instead of once per .compute() call.
print("Start base found sample compute")
print("Start sample index compute")
sample_with_cloud, index_on_sample = dask.compute(
    cloud_over_threshold[0],  # sample (row) index of every over-threshold element
    cloud_over_threshold[1],  # height-level (column) index of the same elements
)

Start base found sample compute
Start sample index compute
CPU times: user 475 ms, sys: 232 ms, total: 707 ms
Wall time: 657 ms


In [15]:
%%time
# remove repeat indices: a sample appears once per over-threshold level, so keep one entry per sample.
# return_index gives, for each unique sample id, the position of its first occurrence in
# sample_with_cloud; presumably the `where` output is in row-major order, which would make that
# first occurrence the lowest over-threshold height level -- confirm
_, first_duplicate_indicies = np.unique(sample_with_cloud, return_index=True)
# print("Start duplicate indicies compute")
# first_duplicate_indicies = first_duplicate_indicies.compute()
# print("Number of cloud bases found:",first_duplicate_indicies.shape)
# print("Out of samples:", tar_array.shape[0])

CPU times: user 30.6 ms, sys: 24 ms, total: 54.6 ms
Wall time: 52.3 ms


In [16]:
%%time
# for samples where no base was found, add a marker at the final height layer
# (where no cloud volume over threshold appears in the data)

# verify the claim that no cloud bases appear in the final layer
# can be strengthened to, no clouds exist in the final layer (next line returns 0)
# print("list of clouds at final height level:", np.where(tar_array[:,-1]>cloud_threshold))

# encode the cloud base in a one-hot vector: one row per sample, one column per height level
one_hot_encoded_bases = np.zeros(tar_array.shape)
one_hot_encoded_bases[sample_with_cloud[first_duplicate_indicies],index_on_sample[first_duplicate_indicies]] = 1

# mark the end (final layer) if no cloud base detected.
# `~` negates the boolean row mask element-wise in numpy, replacing a np.vectorize'd Python
# lambda that looped over every sample and could not handle an empty array
samples_without_base = ~np.any(one_hot_encoded_bases, axis=1)
one_hot_encoded_bases[samples_without_base, -1] = 1

# Now reduce vectors as if each height layer is treated as a class where the model will predict, onehot -> class label e.g. 0,0,1,0, -> 2
class_label_encoded_bases = np.argmax(one_hot_encoded_bases, axis=1)

CPU times: user 128 ms, sys: 26 ms, total: 154 ms
Wall time: 151 ms


In [17]:
# report the shapes of both target encodings (class labels and one-hot vectors)
print("Target as class label:", class_label_encoded_bases.shape)
print("Output dim:", one_hot_encoded_bases.shape)

Target as class label: (307200,)
Output dim: (307200, 70)


In [18]:
# # optionally, free up some memory

# del sample_with_cloud
# del cloud_over_threshold
# del first_duplicate_indicies
# del index_on_sample
# del tar_dict
# del tar_cube
# del cubes

### Normalize input data

In [19]:
%%time
# preprocess the inputs
# min-max normalize each feature (last axis) over all samples and height levels:
#     normalized = (value - feature_min) / (feature_max - feature_min)
# print("current array type:", type(inp_array))

# keepdims=True leaves the reductions at shape (1, 1, n_features), so they broadcast against
# inp_array directly -- no transposes and no hard-coded number of features
# NOTE: a feature that is constant everywhere has a zero range and would divide by zero
feature_min = inp_array.min(axis=(0, 1), keepdims=True)
feature_range = inp_array.max(axis=(0, 1), keepdims=True) - feature_min
inp_array = (inp_array - feature_min) / feature_range

# # commented a 2 half compute used to avoid memory constraints, some more code that was used to recombine is not included
# half = int(len(inp_array) / 2)
# inp_array_1 = inp_array[:half].compute()
# len_first_half = inp_array_1.shape[0]

# materialize the lazy array in memory
inp_array = inp_array.compute()
print('Finished compute of input array normalization')
# convert to regular array, after verifying mask does not identify any values
# (print below gives 0 masked values)
num_of_masked = np.ma.count_masked(inp_array)
print("Number of masked values after computation:", num_of_masked)
assert num_of_masked == 0
# unmask; any masked value would become NaN (the assert above guarantees there are none)
inp_array = np.ma.filled(inp_array, np.nan)

# # verify dimensions
# print(inp_array.shape)
# # and verify type
# print("type of unmasked array:", type(inp_array))

Finished compute of input array normalization
Number of masked values after computation: 0
CPU times: user 1.41 s, sys: 1.32 s, total: 2.73 s
Wall time: 1.81 s


In [20]:
# # second half of commented memory constraint compute, see above cell
# %%time
# inp_array_2 = inp_array[half:].compute()
# print("Array type after compute:", type(inp_array_2))
# num_of_masked = np.ma.count_masked(inp_array_2)
# print("Count of masked (unfilled) values:", num_of_masked)
# assert num_of_masked == 0
# inp_array_2 = np.ma.filled(inp_array_2, np.nan)
# print("Array type after compute:", type(inp_array_2))
# len_second_half = inp_array_2.shape[0]

# # verify 
# print(len_second_half + len_first_half)
# print(inp_array.shape)

In [21]:
# compute and unmask target array (cloud volume)
# tar_array is overwritten by its computed result, so this cell is meant to be run once per kernel session
print("Current type of target array:", type(tar_array))
print("Target shape:", tar_array.shape)
tar_array = tar_array.compute()
print("Finished compute of target array")

# guard: the fill below would turn masked values into NaN, so require that none exist
num_of_masked = np.ma.count_masked(tar_array)
print("Number of masked values after computation:", num_of_masked)
assert num_of_masked == 0

#unmask
tar_array = np.ma.filled(tar_array, np.nan)

Current type of target array: <class 'dask.array.core.Array'>
Target shape: (307200, 70)
Finished compute of target array
Number of masked values after computation: 0


#### View the produced arrays which are ready to be saved

In [22]:
inp_array

array([[[6.5583849e-01, 4.3776825e-01, 1.3042120e-03],
        [6.5355343e-01, 4.4206008e-01, 1.5034666e-03],
        [6.5037358e-01, 4.4349071e-01, 1.6778144e-03],
        ...,
        [3.2558051e-05, 6.8311876e-01, 6.3399195e-05],
        [1.0852683e-05, 4.9785408e-01, 6.1134939e-05],
        [1.2058537e-06, 1.6380544e-01, 5.6606423e-05]],

       [[6.5592051e-01, 4.3776825e-01, 1.3042120e-03],
        [6.5362817e-01, 4.4563663e-01, 1.5193163e-03],
        [6.5044475e-01, 4.5064378e-01, 1.7615919e-03],
        ...,
        [3.2558051e-05, 6.8311876e-01, 6.3399195e-05],
        [1.0852683e-05, 4.9785408e-01, 6.1134939e-05],
        [1.2058537e-06, 1.6452074e-01, 5.6606423e-05]],

       [[6.5598440e-01, 4.3776825e-01, 1.3200617e-03],
        [6.5368849e-01, 4.4349071e-01, 1.4898811e-03],
        [6.5050262e-01, 4.5064378e-01, 1.7615919e-03],
        ...,
        [3.2558051e-05, 6.8311876e-01, 6.3399195e-05],
        [1.0852683e-05, 4.9785408e-01, 6.1134939e-05],
        [1.2058537e-06

In [23]:
class_label_encoded_bases

array([16, 15, 15, ...,  0,  0,  0])

In [24]:
tar_array

array([[0.140625, 0.125   , 0.09375 , ..., 0.      , 0.      , 0.      ],
       [0.140625, 0.109375, 0.09375 , ..., 0.      , 0.      , 0.      ],
       [0.140625, 0.125   , 0.09375 , ..., 0.      , 0.      , 0.      ],
       ...,
       [0.90625 , 1.      , 1.      , ..., 0.      , 0.      , 0.      ],
       [1.      , 1.      , 1.      , ..., 0.      , 0.      , 0.      ],
       [0.953125, 1.      , 1.      , ..., 0.      , 0.      , 0.      ]],
      dtype=float32)

In [25]:
one_hot_encoded_bases

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

#### Save a selection of wanted arrays (inp_array, tar_array, one_hot_encoded_bases)

In [26]:
# verify input and output shapes before saving (both are expected to share the leading sample dimension)
print("Input dim:", inp_array.shape)
print("Cloud Output dim:", tar_array.shape)

Input dim: (307200, 70, 3)
Cloud Output dim: (307200, 70)


In [1]:
# save the computed arrays
# (only one of the class-label and one-hot targets is saved, since each is easily derived from the other)
# (the one-hot form is saved to match the data produced/used by the base solution)

In [28]:
%%time
print("Saving numpy arrays")

#variable assignment that name the arrays for the saved file
input_x=inp_array
output_onehot=one_hot_encoded_bases
output_cloud_volume=tar_array

# the context manager closes the file even if np.savez raises part-way through the write
with open(path_to_save_result, "w+b") as f:
    # the keyword names become the keys under which the arrays are stored in the .npz archive
    np.savez(f, input_x=input_x, output_cloud_volume=output_cloud_volume, output_onehot=output_onehot)
print("Save Finished")

Saving numpy arrays
Save Finished
CPU times: user 745 ms, sys: 142 ms, total: 887 ms
Wall time: 1.26 s


The following cell contains code that creates a positional encoding for the height layers in the data; e.g. the data at height layer 0 would carry the positional encoding 0 as an extra input feature. It is commented out because PyTorch DataLoaders can produce this information at load time, which is preferable to storing a potentially huge array that redundantly repeats the same positions for every sample.

In [29]:
# # create an extra positional encoding optionally for input use

# sample_num, height_dim, _ = inp_array.shape
# # generate height values
# height_position_vector = np.arange(height_dim)
# # extend dimensions out to match input feats
# height_position_vector = np.repeat([height_position_vector], sample_num, axis=0)

# # # verify
# # print(height_position_vector.shape)

# x,y = height_position_vector.shape
# # add a dimension for height to act as a feature
# height_position_vector = height_position_vector.reshape(x,y,1)

# # fit the dtype of the feature to match the dtype of other feats
# height_position_vector = height_position_vector.astype(inp_array.dtype)

# # # combine height feature into input array 
# # inp_array = np.concatenate((height_position_vector, inp_array), axis=2, dtype=np.float32) #leave the concat for within the model after producing embedding

# # verify datatypes
# print("input dtype", inp_array.dtype)
# print("height encoding dtype", height_position_vector.dtype)

## Convert saved numpy arrays to zarr files

In [None]:
import zarr # (package not in cop torch conda env)

# open the numpy files and load each array into a variable
# read-only binary mode is sufficient for loading; the context manager closes the file even if
# a key is missing or a read fails
with open(path_to_save_result, "rb") as f:
    numpy_files = np.load(f)
    # each array is read from the archive when its key is accessed, so this must happen
    # while the file is still open
    inp_arr = numpy_files['input_x']
    onehot_arr = numpy_files['output_onehot']
    tar_arr = numpy_files['output_cloud_volume']


In [None]:
# directory-backed zarr store at the configured output path
store = zarr.DirectoryStore(path_to_save_zarr)

# define the group object for the arrays to be stored under;
# overwrite=True is passed so the group is created afresh in the store

zarr_grouping = zarr.group(store=store, overwrite=True)

# initialize and then write on zarr arrays for all desired arrays to be saved:
# each array is first created zero-filled with the source shape and dtype, then filled by slice assignment

humidity_temp_pressure_x =  zarr_grouping.zeros(shape=inp_arr.shape, dtype=inp_arr.dtype, name='humidity_temp_pressure_x.zarr')
humidity_temp_pressure_x[:] = inp_arr

onehot_cloud_base_height_y = zarr_grouping.zeros(shape=onehot_arr.shape, dtype=onehot_arr.dtype, name='onehot_cloud_base_height_y.zarr')
onehot_cloud_base_height_y[:] = onehot_arr

cloud_volume_fraction_y = zarr_grouping.zeros(shape=tar_arr.shape, dtype=tar_arr.dtype, name='cloud_volume_fraction_y.zarr')
cloud_volume_fraction_y[:] = tar_arr

# close the file store after writing is finished
# NOTE(review): if any write above raises, this close is never reached -- consider try/finally

store.close()