In [1]:
import os
import shutil
import subprocess as subp

import xarray as xr
import zarr
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster
from rechunker import rechunk

In [2]:
# Start a local dask cluster of 10 single-threaded workers, each capped at
# 30GiB, with the dashboard served on port 8788, then attach a client to it.
cluster = LocalCluster(
    n_workers=10,
    threads_per_worker=1,
    memory_limit="30GiB",
    dashboard_address=":8788",
)
client = Client(cluster)

In [3]:
# Display the client summary (scheduler address, dashboard URL, worker totals).
client

0,1
Client  Scheduler: tcp://127.0.0.1:41836  Dashboard: http://127.0.0.1:8788/status,Cluster  Workers: 10  Cores: 10  Memory: 300.00 GiB


In [4]:
# Remove everything under the scratch yearchunk/ directory.
# NOTE(review): the rechunking loop in this notebook writes its output to
# 100year10kmchunk/, not yearchunk/ -- confirm this is the directory that
# actually needs clearing.
!rm -r /work/scratch-pw/mattjbr/chess_scape/yearchunk/*

rm: cannot remove ‘/work/scratch-pw/mattjbr/chess_scape/yearchunk/*’: No such file or directory


In [None]:
# Rechunk every (ensemble member, variable) zarr store from the
# year/100km-chunked layout to a layout with the whole time axis in one chunk
# and 10x10 spatial chunks.
ensmems = ["01", "04", "06", "15"]
varnames = [
    "hurs",
    "huss",
    "pr",
    "psurf",
    "rlds",
    "rsds",
    "sfcWind",
    "tmax",
    "tmean",
    "tmin",
]

# Variables whose name inside the zarr store differs from the file-name stem.
zarr_varnames = {"tmean": "tas", "tmax": "tasmax", "tmin": "tasmin"}

# Target chunk lengths along each dimension.
time_chunk = 36000  # presumably the full time axis length -- TODO confirm for every store
space_chunk = 10

maxmem = "20GB"  # max_mem handed to rechunker
source_root = "/gws/nopw/j04/ceh_generic/matbro/chess_scape/year100kmchunk"
target_root = "/work/scratch-pw/mattjbr/chess_scape/100year10kmchunk"
tempstore = "/work/scratch-pw/mattjbr/chess_scape/tempstore.zarr"

# Output directories are the same for every iteration, so create them once.
os.makedirs(os.path.dirname(tempstore), exist_ok=True)
os.makedirs(target_root, exist_ok=True)

for ensmem in ensmems:
    for varname in varnames:
        # psurf is only processed for ensemble member 01.
        if varname == "psurf" and ensmem != "01":
            continue

        print("Processing " + ensmem + " " + varname)

        source_group = zarr.open(
            os.path.join(
                source_root,
                "ens" + ensmem + "-year100kmchunk",
                varname + "_" + ensmem + "_year100km.zarr",
            )
        )
        targetstore = os.path.join(
            target_root, varname + "_" + ensmem + "_100year10km.zarr"
        )

        zarrvname = zarr_varnames.get(varname, varname)

        target_chunks = {
            zarrvname: {"time": time_chunk, "y": space_chunk, "x": space_chunk},
            "lat": {"y": space_chunk, "x": space_chunk},
            "lon": {"y": space_chunk, "x": space_chunk},
            "y": {"y": space_chunk},
            "x": {"x": space_chunk},
            "time": {"time": time_chunk},
        }

        # The temp store is a zarr *directory*, so a plain `rm` cannot delete
        # it; remove the whole tree so intermediates from the previous
        # iteration do not leak into this one. A missing store is not an error.
        shutil.rmtree(tempstore, ignore_errors=True)
        array_plan = rechunk(
            source_group, target_chunks, maxmem, targetstore, temp_store=tempstore
        )
        array_plan.execute()

Processing 01 hurs




Processing 01 huss




Processing 01 pr




Processing 01 psurf
Processing 01 rlds
Processing 01 rsds
Processing 01 sfcWind
Processing 01 tmax
Processing 01 tmean
Processing 01 tmin
Processing 04 hurs
Processing 04 huss
Processing 04 pr
Processing 04 rlds
Processing 04 rsds
Processing 04 sfcWind
Processing 04 tmax
Processing 04 tmean
Processing 04 tmin
Processing 06 hurs


---------------------

The optimum chunk size for an S3 object store is about 250 MB. Work out how many time steps per chunk this corresponds to when chunking along time only (i.e. each chunk spans the full spatial domain).

In [22]:
import math

In [61]:
# Open the month-chunked tmean store for ensemble member 01; it is used below
# as the reference dataset for the chunk-size arithmetic.
monthchunk_store = (
    "/gws/nopw/j04/ceh_generic/matbro/chess_scape/monthchunk/ens01-monthchunk/tmean.zarr"
)
data = xr.open_zarr(monthchunk_store)

In [62]:
# Show the dataset repr to inspect array shapes and the current chunking.
data

Unnamed: 0,Array,Chunk
Bytes,2.65 MiB,2.65 MiB
Shape,"(1057, 656)","(1057, 656)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.65 MiB 2.65 MiB Shape (1057, 656) (1057, 656) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",656  1057,

Unnamed: 0,Array,Chunk
Bytes,2.65 MiB,2.65 MiB
Shape,"(1057, 656)","(1057, 656)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.65 MiB,2.65 MiB
Shape,"(1057, 656)","(1057, 656)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.65 MiB 2.65 MiB Shape (1057, 656) (1057, 656) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",656  1057,

Unnamed: 0,Array,Chunk
Bytes,2.65 MiB,2.65 MiB
Shape,"(1057, 656)","(1057, 656)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.99 GiB,79.35 MiB
Shape,"(36000, 1057, 656)","(30, 1057, 656)"
Count,1201 Tasks,1200 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 92.99 GiB 79.35 MiB Shape (36000, 1057, 656) (30, 1057, 656) Count 1201 Tasks 1200 Chunks Type float32 numpy.ndarray",656  1057  36000,

Unnamed: 0,Array,Chunk
Bytes,92.99 GiB,79.35 MiB
Shape,"(36000, 1057, 656)","(30, 1057, 656)"
Count,1201 Tasks,1200 Chunks
Type,float32,numpy.ndarray


In [31]:
# Uncompressed size of the "tas" array in MB (1 MB = 1e6 bytes).
bytes_per_mb = 1e6
nmbytes = data["tas"].nbytes / bytes_per_mb

In [78]:
# Target chunk size in MB.
chunk_size_mbytes = 250
# Number of chunks needed so that each one is at most the target size ...
n_chunks = math.ceil(nmbytes / chunk_size_mbytes)
# ... and the number of time steps that puts in each chunk.
time_chunk = math.ceil(len(data.time) / n_chunks)
print(time_chunk)

90


Now calculate the chunk sizes (in MB) for other chunking configurations.

In [64]:
def calc_chunk_size(tsize, xsize, ysize, itemsize=4):
    """Return the uncompressed size of a single chunk in MB (1 MB = 1e6 bytes).

    Parameters
    ----------
    tsize, xsize, ysize : int
        Number of elements in the chunk along the time, x and y dimensions.
    itemsize : int, optional
        Size of one array element in bytes. Defaults to 4 (e.g. float32).

    Returns
    -------
    float
        ``tsize * xsize * ysize * itemsize / 1e6``.
    """
    return (tsize * xsize * ysize * itemsize) / 1e6

In [65]:
calc_chunk_size(30, 656, 1057)  # 30 time steps (one month) per chunk, full 656 x 1057 grid

83.20704

In [66]:
calc_chunk_size(90, 656, 1057)  # 90 time steps (one season) per chunk, full 656 x 1057 grid

249.62112

In [67]:
calc_chunk_size(360, 656, 1057)  # 360 time steps (one year) per chunk, full 656 x 1057 grid

998.48448

In [69]:
calc_chunk_size(360, 100, 100)  # 360 time steps (one year), 100 x 100 grid cells ("100km") per chunk

14.4

In [70]:
calc_chunk_size(3600, 100, 100)  # 3600 time steps (one decade), 100 x 100 grid cells ("100km") per chunk

144.0

In [77]:
calc_chunk_size(36000, 10, 10)  # 36000 time steps (one century), 10 x 10 grid cells ("10km") per chunk

14.4