In [2]:
from pprint import pprint as print
import pandas as pd
import numpy as np
import zarr

## Zarr: N-dimensional arrays with any NumPy dtype
Zarr is a format for the storage of chunked, compressed, N-dimensional arrays inspired by [HDF5](https://www.hdfgroup.org/solutions/hdf5/), [h5py](https://www.h5py.org/) and [bcolz](https://bcolz.readthedocs.io/en/latest/).

## Highlights
- Create N-dimensional arrays with any NumPy dtype.
- Chunk arrays along any dimension.
- Compress and/or filter chunks using any NumCodecs codec.
- Store arrays in memory, on disk, inside a Zip file, on S3, …
- Read an array concurrently from multiple threads or processes.
- Write to an array concurrently from multiple threads or processes.
- Organize arrays into hierarchies via groups.

In [8]:
root = zarr.group(store='root.zarr', overwrite=True)
foo = root.create_group('foo')
bar = foo.create_group('bar')

In [7]:
foo.info

0,1
Name,/foo
Type,zarr.hierarchy.Group
Read-only,False
Store type,zarr.storage.MemoryStore
No. members,1
No. arrays,0
No. groups,1
Groups,bar


In [13]:
bar.zeros??

[0;31mSignature:[0m [0mbar[0m[0;34m.[0m[0mzeros[0m[0;34m([0m[0mname[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mzeros[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Create an array. Keyword arguments as per[0m
[0;34m        :func:`zarr.creation.zeros`."""[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mself[0m[0;34m.[0m[0m_write_op[0m[0;34m([0m[0mself[0m[0;34m.[0m[0m_zeros_nosync[0m[0;34m,[0m [0mname[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/zarr/lib/python3.10/site-packages/zarr/hierarchy.py
[0;31mType:[0m      method

In [14]:
bar.create??

[0;31mSignature:[0m [0mbar[0m[0;34m.[0m[0mcreate[0m[0;34m([0m[0mname[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mcreate[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Create an array. Keyword arguments as per[0m
[0;34m        :func:`zarr.creation.create`."""[0m[0;34m[0m
[0;34m[0m        [0;32mreturn[0m [0mself[0m[0;34m.[0m[0m_write_op[0m[0;34m([0m[0mself[0m[0;34m.[0m[0m_create_nosync[0m[0;34m,[0m [0mname[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      ~/miniconda3/envs/zarr/lib/python3.10/site-packages/zarr/hierarchy.py
[0;31mType:[0m      method

In [15]:
z1 = bar.zeros('baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4', overwrite=True)
z1.info

0,1
Name,/foo/bar/baz
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,400000000 (381.5M)


In [22]:
# Arrays are known as “datasets” in HDF5 terminology. For compatibility with h5py, Zarr groups also implement the create_dataset() and require_dataset() methods, e.g.:
z = bar.create_dataset('quux', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4', overwrite=True)
z.info

0,1
Name,/foo/bar/quux
Type,zarr.core.Array
Data type,int32
Shape,"(10000, 10000)"
Chunk shape,"(1000, 1000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,400000000 (381.5M)


In [23]:
root.tree()

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, name='foo', nodes=(Node(disabled=True, na…

# Indexing

### Indexing with coordinate arrays

In [24]:
z = zarr.array(np.arange(10) ** 2)
z[:]

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])

In [25]:
z.get_coordinate_selection([2, 5])

array([ 4, 25])

In [27]:
z.set_coordinate_selection([2, 5], [-1, -2]) # To Update values|
z[:]

array([ 0,  1, -1,  9, 16, -2, 36, 49, 64, 81])

In [28]:
# Multidimensional array
z = zarr.array(np.arange(15).reshape(3, 5))
z[:]

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [31]:
z[0,1], z[2,3]

(1, 13)

In [32]:
z.get_coordinate_selection(([0, 2], [1, 3])) # Get the values at the coordinates (0,1) and (2,3)

array([ 1, 13])

In [33]:
z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) # Update the values at the coordinates (0,1) and (2,3)
z[:]

array([[ 0, -1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, -2, 14]])

In [34]:
# VIndex for sqare bracket indexing
z.vindex[[0, 2], [1, 3]]

array([-1, -2])

In [36]:
z.vindex[[0, 2], [1, 3]] = [-3, -4]
z[:]

array([[ 0, -3,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, -4, 14]])

### Boolean Indexing

In [38]:
z = zarr.array(np.arange(10)**2)
z[:]

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])

In [39]:
sel = np.zeros_like(z, dtype=bool)
sel

array([False, False, False, False, False, False, False, False, False,
       False])

In [40]:
sel[2] = True
sel[3] = True
z.get_mask_selection(sel)

array([4, 9])

In [41]:
z.set_mask_selection(sel, [-1, -2])
z[:]

array([ 0,  1, -1, -2, 16, 25, 36, 49, 64, 81])

In [42]:
# Multidimensional
z = zarr.array(np.arange(15).reshape(3, 5))
z[:]

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [43]:
sel = np.zeros_like(z, dtype=bool)
sel[0, 1] = True
sel[2, 3] = True
z.get_mask_selection(sel)

array([ 1, 13])

In [44]:
z.set_mask_selection(sel, [-1, -2])
z[:]

array([[ 0, -1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, -2, 14]])

In [45]:
# Vindex for boolean indexing
print(z.vindex[sel])
z.vindex[sel] = [-3, -4]
z[:]

array([-1, -2])


array([[ 0, -3,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, -4, 14]])

### Orthogonal Indexing
Allows selections to be made along each dimension of an array independently. 

In [46]:
z = zarr.array(np.arange(15).reshape(3, 5))
z[:]

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [47]:
z.get_orthogonal_selection(([0, 2], slice(None)))  # select first and third rows

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [48]:
z.get_orthogonal_selection((slice(None), [1, 3]))  # select second and fourth columns

array([[ 1,  3],
       [ 6,  8],
       [11, 13]])

In [49]:
z.get_orthogonal_selection(([0, 2], [1, 3]))       # select rows [0, 2] and columns [1, 4]

array([[ 1,  3],
       [11, 13]])

In [50]:
# Functionality via oindex
z = zarr.array(np.arange(15).reshape(3, 5))
z.oindex[[0, 2], :]  # select first and third rows

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [51]:
z.oindex[:, [1, 3]]  # select second and fourth columns

array([[ 1,  3],
       [ 6,  8],
       [11, 13]])

In [52]:
z.oindex[[0, 2], [1, 3]]  # select rows [0, 2] and columns [1, 4]

array([[ 1,  3],
       [11, 13]])

In [53]:
z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]]
z[:]

array([[ 0, -1,  2, -2,  4],
       [ 5,  6,  7,  8,  9],
       [10, -3, 12, -4, 14]])

# Block Indexing

Allows selections of whole chunks based on their logical indices along each dimension of an arra

In [54]:
z = zarr.array(np.arange(100).reshape(10, 10), chunks=(3, 3))
z.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(10, 10)"
Chunk shape,"(3, 3)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.KVStore
No. bytes,800
No. bytes stored,1741 (1.7K)


In [57]:
z[:]

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [55]:
# Print number of chunks
print(z.nchunks)

16


In [81]:
for i in range(z.shape[0] // 3):  # Iterate over rows of chunks
    for j in range(z.shape[1] // 3):  # Iterate over columns of chunks
        # chunk slicing
        chunk = z[i * 3:(i + 1) * 3, j * 3:(j + 1) * 3]
        print(f"Chunk at ({i}, {j}):")
        print(chunk)

'Chunk at (0, 0):'
array([[ 0,  1,  2],
       [10, 11, 12],
       [20, 21, 22]])
'Chunk at (0, 1):'
array([[ 3,  4,  5],
       [13, 14, 15],
       [23, 24, 25]])
'Chunk at (0, 2):'
array([[ 6,  7,  8],
       [16, 17, 18],
       [26, 27, 28]])
'Chunk at (1, 0):'
array([[30, 31, 32],
       [40, 41, 42],
       [50, 51, 52]])
'Chunk at (1, 1):'
array([[33, 34, 35],
       [43, 44, 45],
       [53, 54, 55]])
'Chunk at (1, 2):'
array([[36, 37, 38],
       [46, 47, 48],
       [56, 57, 58]])
'Chunk at (2, 0):'
array([[60, 61, 62],
       [70, 71, 72],
       [80, 81, 82]])
'Chunk at (2, 1):'
array([[63, 64, 65],
       [73, 74, 75],
       [83, 84, 85]])
'Chunk at (2, 2):'
array([[66, 67, 68],
       [76, 77, 78],
       [86, 87, 88]])


In [83]:
z.get_block_selection(1)

array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]])

In [82]:
z[3:6]

array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]])

In [92]:
z.blocks[0,:]

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]])

In [91]:
z.blocks[:,0]

array([[ 0,  1,  2],
       [10, 11, 12],
       [20, 21, 22],
       [30, 31, 32],
       [40, 41, 42],
       [50, 51, 52],
       [60, 61, 62],
       [70, 71, 72],
       [80, 81, 82],
       [90, 91, 92]])

In [88]:
z.blocks[0,0]

array([[ 0,  1,  2],
       [10, 11, 12],
       [20, 21, 22]])

In [85]:
z.blocks[0, 1:3]

array([[ 3,  4,  5,  6,  7,  8],
       [13, 14, 15, 16, 17, 18],
       [23, 24, 25, 26, 27, 28]])

In [97]:
z.blocks[...]

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [93]:
# Modification
z = zarr.zeros((6, 6), dtype=int, chunks=2)

In [96]:
z.set_block_selection((1, 0), 1)
z[...] # The `...` is called the Ellipsis object, and it is used to indicate that all axes should be fully included in the selection.

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

## Parallel computing and synchronization

Zarr arrays have been designed for use as the source or sink for data in parallel computations. Zarr arrays have **not** been designed for situations where multiple readers and writers are concurrently operating on the same array.


Both multi-threaded and multi-process parallelism are possible. The bottleneck for most storage and retrieval operations is compression/decompression, and the Python global interpreter lock (GIL) is released wherever possible during these operations, so Zarr will generally not block other Python threads from running.

In [99]:
# Zarr provides support for chunk-level synchronization. E.g., create an array with thread synchronization:
# This array is safe to read or write within a multi-threaded program.
z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', synchronizer=zarr.ThreadSynchronizer())
z # in memory

<zarr.core.Array (10000, 10000) int32>

Zarr also provides support for process synchronization via file locking, provided that all processes have access to a shared file system, and provided that the underlying file system supports file locking (which is not the case for some networked file systems). E.g.:

In [101]:
synchronizer = zarr.ProcessSynchronizer(
                'data/example.sync'
            ) # Path to a directory on a file system that is shared by all processes. N.B., this should be a *different* path to where you store the array data.

z = zarr.open_array(
            'data/example', mode='w', shape=(10000, 10000),
            chunks=(1000, 1000), dtype='i4', synchronizer=synchronizer
        )
z[:] = np.arange(100000000).reshape(10000, 10000)
z # in disk

<zarr.core.Array (10000, 10000) int32>

# Pickling Support

In [111]:
# allowing access to multiple arrays and groups within
zarr.open??

[0;31mSignature:[0m
[0mzarr[0m[0;34m.[0m[0mopen[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mstore[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mzarr[0m[0;34m.[0m[0m_storage[0m[0;34m.[0m[0mstore[0m[0;34m.[0m[0mBaseStore[0m[0;34m,[0m [0mcollections[0m[0;34m.[0m[0mabc[0m[0;34m.[0m[0mMutableMapping[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'a'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mzarr_version[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mopen[0m[0;34m([0m[0mstore[0m[0;34m:[0m [0mStoreLike[0m [0;34m=[0m

In [112]:
# open a single Zarr array within a Zarr store
zarr.open_array??

[0;31mSignature:[0m
[0mzarr[0m[0;34m.[0m[0mopen_array[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mstore[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmode[0m[0;34m=[0m[0;34m'a'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshape[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchunks[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcompressor[0m[0;34m=[0m[0;34m'default'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfill_value[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0morder[0m[0;34m=[0m[0;34m'C'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msynchronizer[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilters[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_metadata[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    

In [118]:
tmp = zarr.open('group.zarr')
tmp.info

0,1
Name,/
Type,zarr.hierarchy.Group
Read-only,False
Store type,zarr.storage.DirectoryStore
No. members,2
No. arrays,2
No. groups,0
Arrays,"bar, foo"


In [120]:
import pickle
z3 = zarr.open_array('data/walnuts.zarr', mode='w', shape=100000, dtype='i8')
z3[:] = np.arange(100000)
z3.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(100000,)"
Chunk shape,"(25000,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,800000 (781.2K)
No. bytes stored,6568 (6.4K)


In [123]:
len(s)

501

In [121]:
s = pickle.dumps(z3)
len(s) < 200  # small because no data have been pickled

False

In [122]:
z4 = pickle.loads(s)
z3 == z4

True