In [1]:
from pprint import pprint as print
import pandas as pd
import numpy as np
import zarr
import os

# Review: NumPy Arrays

In [2]:
# Allocate a 10x20 float64 array of zeros, then show its shape and dtype.
a = np.zeros((10, 20), dtype=np.float64)
a.shape, a.dtype

((10, 20), dtype('float64'))

In [3]:
# `print` was rebound to `pprint.pprint` in the import cell, so this shows the
# string's repr (quotes and the escaped newline) rather than the raw text.
print("Hello World\n")

'Hello World\n'


This array lives in memory.

How much memory does the array use?

In [4]:
# Size of the array's data in bytes: 10 * 20 elements * 8 bytes each (float64).
a.nbytes

1600

Getting a piece of data with slicing:

In [5]:
# Basic slicing: the first two rows and the first two columns of `a`.
a[:2, :2]

array([[0., 0.],
       [0., 0.]])

Create a new array and assign to it:



In [6]:
# Start from a 40x50 array of ones, then overwrite its top-left 10x20 corner
# with the zeros held in `a`.
b = np.ones((40, 50), dtype=np.float64)
b[0:10, 0:20] = a
b

array([[0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [7]:
# Corner check: the top-left element was overwritten from `a`, the bottom-right one was not.
b[0,0], b[-1,-1]

(0.0, 1.0)

# Zarr Fundamentals

A zarr array has four important features:
- Shape
- Dtype
- Chunks
- Attributes

In [10]:
# Create a 40x50 float64 Zarr array split into 10x10 chunks; each chunk is
# compressed and stored separately.
# `store` says where the array lives: a string path is backed by a directory on
# disk (see "Store type" in `z.info`).
# `overwrite=True` clears any array already at that path, so re-running this
# cell does not fail because 'test.zarr' is already there.
z = zarr.create(
    shape=(40, 50),
    chunks=(10, 10),
    dtype='f8',
    store='test.zarr',
    overwrite=True,
)
z

<zarr.core.Array (40, 50) float64>

In [11]:
# Summary of the array's metadata (type, dtype, shape, chunks, compressor) and storage footprint.
z.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(40, 50)"
Chunk shape,"(10, 10)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,16000 (15.6K)
No. bytes stored,337


In [12]:
# Write data: broadcast the scalar 1 into every element of the array.
z[...] = 1

In [13]:
# Inspect the array again now that data has been written; check "No. bytes stored".
z.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(40, 50)"
Chunk shape,"(10, 10)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,16000 (15.6K)
No. bytes stored,1277 (1.2K)


In [14]:
# Attributes attach user-defined metadata to a Zarr array; record the units here.
z.attrs.update(units='m/s')

In [15]:
# Show all attributes of the array as a plain dictionary.
print(z.attrs.asdict())

{'units': 'm/s'}


# Under the hood

Where/how is our data actually stored? 

The ability to look inside a Zarr store and understand what is there is a deliberate design decision.

In [16]:
# The store object backing this array.
z.store

<zarr.storage.DirectoryStore at 0x7f3c2c40f580>

In [17]:
# List every file in the store directory; `-a` includes dotfiles such as .zarray and .zattrs.
!tree -a test.zarr

[01;34mtest.zarr[0m
├── .zarray
├── .zattrs
├── 0.0
├── 0.1
├── 0.2
├── 0.3
├── 0.4
├── 1.0
├── 1.1
├── 1.2
├── 1.3
├── 1.4
├── 2.0
├── 2.1
├── 2.2
├── 2.3
├── 2.4
├── 3.0
├── 3.1
├── 3.2
├── 3.3
└── 3.4

0 directories, 22 files


In [18]:
# Build a (3, 2, 2) integer array holding 1..12 in row-major order.
tmp = np.arange(1, 13).reshape(3, 2, 2)
print(tmp.shape)
# Index 0 along the middle axis: the first row of each 2x2 block.
tmp[:,0,:]

(3, 2, 2)


array([[ 1,  2],
       [ 5,  6],
       [ 9, 10]])

In [19]:
# Index 0 and index 1 along the last axis: the first and second column of each 2x2 block.
tmp[..., 0], tmp[..., 1]

(array([[ 1,  3],
        [ 5,  7],
        [ 9, 11]]),
 array([[ 2,  4],
        [ 6,  8],
        [10, 12]]))

In [20]:
import json

# .zarray is a plain JSON document; load it and show the parsed content.
with open('test.zarr/.zarray') as zarray_file:
    zarray_meta = json.load(zarray_file)
display(zarray_meta)

{'chunks': [10, 10],
 'compressor': {'blocksize': 0,
  'clevel': 5,
  'cname': 'lz4',
  'id': 'blosc',
  'shuffle': 1},
 'dtype': '<f8',
 'fill_value': 0.0,
 'filters': None,
 'order': 'C',
 'shape': [40, 50],
 'zarr_format': 2}

In [21]:
import json

# .zattrs is a plain JSON document holding the attributes set via `z.attrs`.
with open('test.zarr/.zattrs') as zattrs_file:
    zattrs_meta = json.load(zattrs_file)
display(zattrs_meta)

{'units': 'm/s'}

# Choosing Chunks
The main parameter we control when creating Zarr Arrays is the chunk shape. When selecting chunks, we need to keep in mind two constraints:
- Writes can be concurrent (come from different processes simultaneously) if they do not touch the same chunks. _This enables massively parallel writing in the cloud._
- When reading the data, if any piece of the chunk is needed, the entire chunk has to be loaded.

Here we compare two different chunking strategies:

In [22]:
# Interactive lookup: `??` is IPython syntax that displays the documentation of np.random.rand.
np.random.rand??

[0;31mDocstring:[0m
rand(d0, d1, ..., dn)

Random values in a given shape.

.. note::
    This is a convenience function for users porting code from Matlab,
    and wraps `random_sample`. That function takes a
    tuple to specify the size of the output, which is consistent with
    other NumPy functions like `numpy.zeros` and `numpy.ones`.

Create an array of the given shape and populate it with
random samples from a uniform distribution
over ``[0, 1)``.

Parameters
----------
d0, d1, ..., dn : int, optional
    The dimensions of the returned array, must be non-negative.
    If no argument is given a single Python float is returned.

Returns
-------
out : ndarray, shape ``(d0, d1, ..., dn)``
    Random values.

See Also
--------
random

Examples
--------
>>> np.random.rand(3,2)
array([[ 0.14022471,  0.96360618],  #random
       [ 0.37601032,  0.25528411],  #random
       [ 0.49313049,  0.94909878]]) #random
[0;31mType:[0m      builtin_function_or_method

In [23]:
# Chunked along the first axis, contiguous along the other two: every chunk is
# one full (200, 200) slab.
# `overwrite=True` lets this cell be re-run even though 'c.zarr' already exists.
c = zarr.create(
    shape=(200, 200, 200),
    chunks=(1, 200, 200),
    dtype='f8',
    store='c.zarr',
    overwrite=True,
)
# Fill the whole array with uniform random values in [0, 1).
c[:] = np.random.rand(*c.shape)

In [24]:
# With chunks of (1, 200, 200), every index along the first axis is its own chunk,
# so this one-dimensional selection has to touch all of them.
%time _ = c[:,0,0] # Here we are reading one element from every chunk along the first axis

CPU times: user 44.8 ms, sys: 72 ms, total: 117 ms
Wall time: 115 ms


In [25]:
# Chunked along the third axis, contiguous along the other two: every chunk is
# one full (200, 200) slab with a single index on the last axis.
# `overwrite=True` lets this cell be re-run even though 'd.zarr' already exists.
d = zarr.create(
    shape=(200, 200, 200),
    chunks=(200, 200, 1),
    dtype='f8',
    store='d.zarr',
    overwrite=True,
)
# Fill the whole array with uniform random values in [0, 1).
d[:] = np.random.rand(*d.shape)

In [26]:
# With chunks of (200, 200, 1), the whole selection lies inside the chunk at
# index 0 along the third axis.
%time _ = d[:,0,0] # Here the selection sits in a single chunk, so only that one chunk is read

CPU times: user 0 ns, sys: 3.73 ms, total: 3.73 ms
Wall time: 3.21 ms


In [27]:
# Chunks of (100, 100, 1) on a (100, 200, 300) array: chunked along the second
# and third axes, giving 1 x 2 x 300 = 600 chunks.
# `overwrite=True` removes any array left at 'tmp.zarr' by an earlier run, so no
# separate shell `rm -rf` is needed and the cell does not depend on a POSIX shell.
tmp = zarr.create(
    shape=(100, 200, 300),
    chunks=(100, 100, 1),
    dtype='f8',
    store='tmp.zarr',
    overwrite=True,
)
tmp[:] = np.random.rand(*tmp.shape)
# Count the entries in the store directory (chunk files plus array metadata).
print(len(os.listdir('tmp.zarr')))

601


There is no universally perfect chunk size / shape. Need to consider:
- Access patterns for data
- Latency & throughput of storage system
- Constraints on number of files / objects (don't want a billion files)

Rechunker Package: https://rechunker.readthedocs.io/en/latest/
- Addresses the question: how do you transform an array from one chunking scheme to another?


Zarr is a data storage format. Xarray is an API and data model for computing.
- Zarr = fileformat
- Xarray = data analysis library

# Resize Arrays

In [28]:
# Grow the first axis from 200 to 400; the other two axes keep their size.
# Note: making an array smaller discards the data outside the new shape.
new_shape = (400, 200, 200)
c.resize(*new_shape)
c.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(400, 200, 200)"
Chunk shape,"(1, 200, 200)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,128000000 (122.1M)
No. bytes stored,56045603 (53.4M)


# Compressors

A big part of the performance of Zarr is due to its support for compression of individual chunks. Zarr by default supports 20 different codecs. These live in a separate package called [numcodecs](https://numcodecs.readthedocs.io/en/stable/). The default compressor is the [Blosc](https://blosc.org/) meta-compressor. It's easy to add a new compressor or filter.

For the sake of time, we've decided to skip going into detail on compressors. You can read the [Zarr docs](https://zarr.readthedocs.io/en/stable/tutorial.html#compressors) for more information. The default compressor usually works well for most applications.

# Groups
To keep many arrays together, we can organize them into groups.

In [29]:
# Create the root group. `overwrite=True` clears anything already stored at
# 'group.zarr', so the member arrays below can be created again on a re-run
# instead of failing because they already exist.
group = zarr.group(store='group.zarr', overwrite=True)
# Two member arrays with different shapes, chunking and dtypes.
group.create(name='foo', shape=(200, 200), chunks=(20, 20), dtype='f8')
group.create(name='bar', shape=(100, 100), chunks=(100, 10), dtype='i4')

<zarr.core.Array '/bar' (100, 100) int32>

In [30]:
# Summary of the group: its store type and the number and names of its members.
group.info

0,1
Name,/
Type,zarr.hierarchy.Group
Read-only,False
Store type,zarr.storage.DirectoryStore
No. members,2
No. arrays,2
No. groups,0
Arrays,"bar, foo"


# Zarr in Cloud
## Writing to and Reading from Cloud Object Storage

Zarr can store data in any storage system that can be represented as a key-value store. Here are some examples:
- A directory on your file system
- A ZipFile
- A [Redis](https://redis.io/) database
- A cloud object store (e.g. S3, GCS, Azure Blob Storage)

In the cell below, we will access an S3 bucket with read-write credentials. These credentials will be disabled after the workshop.

In [31]:
import uuid

# Read the S3 credentials from the environment instead of typing them into the
# notebook, so they never end up saved in the file. The empty-string fallback
# keeps the previous values when the variables are not set.
storage_kwards = {
    "key": os.environ.get("AWS_ACCESS_KEY_ID", ""),
    "secret": os.environ.get("AWS_SECRET_ACCESS_KEY", ""),
}
# A random 32-character hex folder name keeps each run's objects separate.
my_folder = f"s3://my_bucket/{uuid.uuid4().hex}"

In [None]:
target = f"{my_folder}/my_array.zarr"
# `target` is an s3:// URL and the credentials are passed as `key`/`secret`, so
# use the fsspec-backed FSStore, which forwards extra keyword arguments to the
# underlying filesystem. ABSStore is the Azure Blob Storage store and does not
# take these arguments.
store = zarr.storage.FSStore(target, **storage_kwards)

In [34]:
# `overwrite=True` clears whatever is already stored under the group's path.
# 'group.zarr' already holds `foo` and `bar` from the Groups section, and
# creating them again without clearing the group would fail.
# group = zarr.group(store=store, overwrite=True) # For Cloud storage
group = zarr.group(store='group.zarr', overwrite=True) # For local storage
group.create(name='foo', shape=(200,200), chunks=(50,50), dtype='f8')
group.create(name='bar', shape=(100,100), chunks=(20,20), dtype='i4')
group

<zarr.hierarchy.Group '/'>

In [35]:
# Fill the `foo` member with uniform random values, then inspect it.
foo = group['foo']
foo[:] = np.random.rand(*foo.shape)
foo.info

0,1
Name,/foo
Type,zarr.core.Array
Data type,float64
Shape,"(200, 200)"
Chunk shape,"(50, 50)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,320000 (312.5K)


# Consolidating Metadata
Listing directories can sometimes be slow (or impossible) on certain storage systems. Zarr offers the ability to consolidate metadata for an entire group into a single object.

In [36]:
# Gather the metadata of the whole group into one `.zmetadata` document.
consolidated_metadata = zarr.consolidate_metadata('group.zarr')

# The consolidated document is plain JSON; load it and show the parsed content.
with open('group.zarr/.zmetadata') as zmetadata_file:
    zmetadata = json.load(zmetadata_file)
display(zmetadata)

{'metadata': {'.zgroup': {'zarr_format': 2},
  'bar/.zarray': {'chunks': [20, 20],
   'compressor': {'blocksize': 0,
    'clevel': 5,
    'cname': 'lz4',
    'id': 'blosc',
    'shuffle': 1},
   'dtype': '<i4',
   'fill_value': 0,
   'filters': None,
   'order': 'C',
   'shape': [100, 100],
   'zarr_format': 2},
  'foo/.zarray': {'chunks': [50, 50],
   'compressor': {'blocksize': 0,
    'clevel': 5,
    'cname': 'lz4',
    'id': 'blosc',
    'shuffle': 1},
   'dtype': '<f8',
   'fill_value': 0.0,
   'filters': None,
   'order': 'C',
   'shape': [200, 200],
   'zarr_format': 2}},
 'zarr_consolidated_format': 1}

# Zarr + Xarray (+ Dask)

Never used `zarr` library directly. Instead, always read and write Zarr via [Xarray](http://xarray.pydata.org/en/stable/). Xarray's data model is the [NetCDF](https://www.unidata.ucar.edu/software/netcdf/) data model. Xarray is built on top of Pandas and Dask.

Xarray is an open source project and Python package that introduces labels in the form of dimensions, coordinates, and attributes on top of raw NumPy-like arrays, which allows for a more intuitive, more concise, and less error-prone developer experience.

Xarray includes a large and growing library of domain-specific functions for advanced analytics and visualization with these data structures.

## Quick Review of Xarray

In [2]:
import xarray as xr
import hvplot.xarray
import dask

In [3]:
# Open the 'air_temperature' dataset from xarray's tutorial collection and display it.
ds = xr.tutorial.open_dataset('air_temperature')
ds

In [11]:
# Plot the `air` variable with longitude on the x-axis and latitude on the y-axis, using the 'magma' colormap.
ds.air.hvplot(x='lon', y='lat', cmap='magma')

BokehModel(combine_events=True, render_bundle={'docs_json': {'5b7941e7-6002-4fd8-a08e-5165ec3ec70f': {'version…

## Writing Zarr from Xarray
First we chunk the dataset. This accomplishes two things.
- Allows parallel processing using Dask (Not necessary for this small-data example but very useful for big data)
- Automatically maps Dask chunks to Zarr chunks when writing.

In [9]:
# Split the dataset into chunks of 100 steps along `time`; other dimensions stay whole.
ds_chunked = ds.chunk(time=100)
ds_chunked

Unnamed: 0,Array,Chunk
Bytes,29.52 MiB,1.01 MiB
Shape,"(2920, 25, 53)","(100, 25, 53)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 29.52 MiB 1.01 MiB Shape (2920, 25, 53) (100, 25, 53) Dask graph 30 chunks in 2 graph layers Data type float64 numpy.ndarray",53  25  2920,

Unnamed: 0,Array,Chunk
Bytes,29.52 MiB,1.01 MiB
Shape,"(2920, 25, 53)","(100, 25, 53)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [6]:
from dask.diagnostics import ProgressBar

In [10]:
# Interactive lookup: `??` is IPython syntax that displays the signature and documentation of `to_zarr`.
ds_chunked.to_zarr??

[0;31mSignature:[0m
[0mds_chunked[0m[0;34m.[0m[0mto_zarr[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mstore[0m[0;34m:[0m [0;34m'MutableMapping | str | PathLike[str] | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchunk_store[0m[0;34m:[0m [0;34m'MutableMapping | str | PathLike | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m:[0m [0;34m'ZarrWriteModes | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msynchronizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup[0m[0;34m:[0m [0;34m'str | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m:[0m [0;34m'Mapping | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcompute[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[

In [8]:
# Write the chunked dataset to a local Zarr store, replacing any existing one
# (mode='w'), with a Dask progress bar shown while the write runs.
progress = ProgressBar()
with progress:
    ds_chunked.to_zarr(store='air_temp.zarr', mode='w')

  return to_zarr(  # type: ignore[call-overload,misc]


[########################################] | 100% Completed | 103.83 ms
