In [1]:
import xarray
import numpy as np
import zarr
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress
from rechunker import rechunk
from tqdm import tqdm
import gc, shutil
import os

In [2]:
from dask.distributed import Client
# Create the Dask distributed client with six workers.
worker_count = 6
client = Client(n_workers=worker_count)

In [3]:
import shutil
import gc

def rechunk_zarr_store(zarr, chunks, target_zarr, steps = 200, 
                       slice_lengths=None, iterate_over='time',
                      chunk_subset=None, overwrite=False):
    """Rechunk a dataset slice by slice and write the result to ``target_zarr``.

    The dataset is cut into consecutive slices along ``iterate_over``. Each
    slice has its encodings cleared, is rechunked with ``chunks`` and is then
    written to the target store: the first slice of a fresh run with
    ``mode="w"``, every other slice appended along ``iterate_over``.

    If ``target_zarr`` already exists and ``overwrite`` is not ``True``, the
    number of entries already present along ``iterate_over`` is read from the
    target and that many leading entries of the input are skipped, so an
    interrupted run can be resumed. A resumed run only ever appends.

    Parameters
    ----------
    zarr
        Dataset to rechunk (in this notebook, the result of
        ``xarray.open_zarr``). Note that inside this function the name shadows
        the ``zarr`` module.
    chunks
        Chunk specification handed to ``.chunk(...)``.
    target_zarr
        Path of the store to write.
    steps
        Number of slices to aim for when ``slice_lengths`` is not given.
    slice_lengths
        Number of entries along ``iterate_over`` per slice. Defaults to
        ``len(zarr[iterate_over]) // steps``, but never less than 1.
    iterate_over
        Name of the dimension to slice and append along.
    chunk_subset
        If given, only this variable is rechunked; otherwise ``.chunk`` is
        applied to the whole slice.
    overwrite
        When ``True`` an existing target is not inspected and the first slice
        is written with ``mode="w"``.

    Raises
    ------
    ValueError
        If ``slice_lengths`` is smaller than 1.
    """
    total = len(zarr[iterate_over])

    if slice_lengths is None:
        # Never let the step drop to 0 when the dimension is shorter than
        # `steps`; range() would reject a zero step.
        slice_lengths = max(1, total // steps)
    if slice_lengths < 1:
        raise ValueError(f'slice_lengths must be >= 1, got {slice_lengths}')

    if os.path.exists(target_zarr) and overwrite is not True:
        temp = xarray.open_zarr(target_zarr)
        try:
            already_rechunked = len(temp[iterate_over])
        finally:
            # Release the handle even if reading the length fails.
            temp.close()
    else:
        already_rechunked = 0

    for i in tqdm(range(0, total, slice_lengths)):
        stop = i + slice_lengths
        if stop <= already_rechunked:
            # This slice is already fully present in the target.
            print(f'skipped {i}')
            continue

        # Start at the resume point when this slice straddles it.
        start = max(i, already_rechunked)
        zarr_slice = zarr.isel({iterate_over : slice(start, stop)})

        # Clear the encodings of coordinates and data variables before the
        # slice is rechunked and written.
        for v in list(zarr_slice.coords.keys()):
            zarr_slice[v].encoding.clear()

        for var in zarr_slice:
            zarr_slice[var].encoding.clear()

        if chunk_subset is not None:
            zarr_slice[chunk_subset] = zarr_slice[chunk_subset].chunk(chunks)
        else:
            zarr_slice = zarr_slice.chunk(chunks)

        if i == 0 and already_rechunked == 0:
            # Fresh run: create (or replace) the target store.
            zarr_slice.to_zarr(target_zarr, mode="w")
        else:
            # Later slice, or a resumed run: existing target data must be
            # kept, so only ever append.
            zarr_slice.to_zarr(target_zarr, append_dim=iterate_over)
        del zarr_slice
        gc.collect()

In [4]:
# Open the full-disk SEVIRI Zarr store and show its summary as the cell output.
seviri_store = '/scratch/snx3000/kschuurm/ZARR/SEVIRI_FULLDISK.zarr'
seviri = xarray.open_zarr(seviri_store)
seviri

Unnamed: 0,Array,Chunk
Bytes,2.38 TiB,10.16 MiB
Shape,"(11, 245097, 658, 736)","(11, 1, 658, 736)"
Dask graph,245097 chunks in 2 graph layers,245097 chunks in 2 graph layers
Data type,float16 numpy.ndarray,float16 numpy.ndarray
"Array Chunk Bytes 2.38 TiB 10.16 MiB Shape (11, 245097, 658, 736) (11, 1, 658, 736) Dask graph 245097 chunks in 2 graph layers Data type float16 numpy.ndarray",11  1  736  658  245097,

Unnamed: 0,Array,Chunk
Bytes,2.38 TiB,10.16 MiB
Shape,"(11, 245097, 658, 736)","(11, 1, 658, 736)"
Dask graph,245097 chunks in 2 graph layers,245097 chunks in 2 graph layers
Data type,float16 numpy.ndarray,float16 numpy.ndarray


In [None]:
# Rechunk only the `channel_data` variable of the full-disk dataset into a new
# store, processing 2000 entries along `time` per slice and overwriting any
# existing target.
target_zarr = '/scratch/snx3000/kschuurm/ZARR/SEVIRI_FULLDISK_timeseries.zarr'

chunks = {
    'channel': -1,
    'time': -1,
    'y': 100,
    'x': 100,
}

rechunk_zarr_store(
    seviri,
    chunks,
    target_zarr,
    slice_lengths=2000,
    iterate_over='time',
    chunk_subset='channel_data',
    overwrite=True,
)

 81%|████████▏ | 100/123 [42:41<10:04, 26.30s/it]

In [None]:
# Source stores that are combined further down: one dedicated 2018 store plus
# two multi-year stores.
path_2018 = '/scratch/snx3000/kschuurm/ZARR/SEVIRI_2018_2.zarr'
seviri2018 = xarray.open_zarr(path_2018)

path_2016_2022 = '/scratch/snx3000/acarpent/SEVIRI_2016-2022.zarr'
seviri2016_2022 = xarray.open_zarr(path_2016_2022)

path_2012_2015 = '/scratch/snx3000/acarpent/SEVIRI_2012-2015.zarr'
seviri2012_2015 = xarray.open_zarr(path_2012_2015)


In [None]:
# Map every year from 2012 to 2022 to the dataset it is read from.
# 2012-2015 and 2016-2022 come from the two multi-year stores; 2018 is then
# pointed at its dedicated store (re-assigning an existing key keeps the
# chronological key order).
year_to_ds = {yr: seviri2012_2015 for yr in range(2012, 2016)}
year_to_ds.update({yr: seviri2016_2022 for yr in range(2016, 2023)})
year_to_ds[2018] = seviri2018

In [None]:
# Concatenate the per-year selections into one store, written in slices of
# `slice_lengths` entries along `time` with one time step per chunk.
slice_lengths = 1000
target_zarr = '/scratch/snx3000/kschuurm/ZARR/SEVIRI_RSS.zarr'
chunks = {'data':-1, 'time':1, 'y':-1, 'x':-1}

# Tracks whether the target store has been created in this run, so that the
# very first write (whichever year it falls in) uses mode="w" and every later
# write appends. This avoids hard-coding the first year of `year_to_ds`.
store_created = False

for year in tqdm(year_to_ds.keys()):
    # Named `year_ds` rather than `zarr` so that the `zarr` module imported at
    # the top of the notebook is not rebound once this cell has run.
    year_ds = year_to_ds[year].sel(time=str(year))
    
    for i in tqdm(range(0, len(year_ds['time']), slice_lengths)):

        zarr_slice = year_ds.isel({'time' : slice(i, i + slice_lengths)})
        
        # Clear the encodings of coordinates and data variables before the
        # slice is rechunked and written.
        for v in list(zarr_slice.coords.keys()):
            zarr_slice[v].encoding.clear()
                
        for var in zarr_slice:
            zarr_slice[var].encoding.clear()
        
        zarr_slice['__xarray_dataarray_variable__'] = zarr_slice['__xarray_dataarray_variable__'].chunk(chunks)
            
        if not store_created:
            zarr_slice.to_zarr(target_zarr, mode="w")
            store_created = True
        else:
            zarr_slice.to_zarr(target_zarr, append_dim='time')
        del zarr_slice
        gc.collect()

In [None]:
# Open the SARAH3 archive for inspection.
sarah3_path = '/scratch/snx3000/kschuurm/ZARR/SARAH3.zip'
a = xarray.open_zarr(sarah3_path)

In [None]:
# Show the summary of the dataset opened in the preceding cell.
a