In [None]:
import xarray
import numpy as np
import zarr
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress
from rechunker import rechunk
from tqdm import tqdm
import gc
import os

In [None]:
from dask.distributed import Client
# Create a Dask distributed client with 12 workers. No scheduler address is
# passed, so dask.distributed is expected to start a local cluster for it.
# NOTE(review): 12 is hard-coded — confirm it matches the cores available
# on the machine this notebook runs on.
client = Client(n_workers=12)

In [None]:
import shutil
import gc

def rechunk_zarr_store(zarr, chunks, target_zarr, steps=200,
                       slice_lengths=None, iterate_over='time',
                       chunk_subset=None, overwrite=False):
    """Rechunk a dataset slice by slice and write it to a Zarr store.

    The input is cut into consecutive slices along ``iterate_over``; each
    slice is rechunked and written to ``target_zarr``. The first slice
    creates the store, every later slice is appended along ``iterate_over``.
    If the target already exists and ``overwrite`` is not ``True``, the
    number of entries already present along ``iterate_over`` is read from it
    and only the remaining part of the input is written (resume).

    Parameters
    ----------
    zarr : xarray dataset-like
        Source dataset (must support ``isel``, ``coords``, ``chunk`` and
        ``to_zarr``). Note that inside this function the name shadows the
        imported ``zarr`` module.
    chunks : dict
        Chunk specification passed to ``.chunk()``.
    target_zarr : str
        Path of the Zarr store to create or extend.
    steps : int, default 200
        Number of slices to aim for when ``slice_lengths`` is not given.
    slice_lengths : int, optional
        Number of entries along ``iterate_over`` per slice. Defaults to
        ``len(zarr[iterate_over]) / steps`` rounded down, but at least 1.
    iterate_over : str, default 'time'
        Dimension along which the input is sliced and the output appended.
    chunk_subset : str, optional
        If given, only this variable is rechunked; otherwise ``chunks`` is
        applied to the whole slice.
    overwrite : bool, default False
        When ``True`` an existing target is replaced instead of resumed.
    """
    total = len(zarr[iterate_over])

    if slice_lengths is None:
        # Never let the step collapse to 0 (input shorter than `steps`),
        # which would make range() below raise ValueError.
        slice_lengths = max(1, int(total / steps))

    if os.path.exists(target_zarr) and overwrite is not True:
        temp = xarray.open_zarr(target_zarr)
        try:
            already_rechunked = len(temp[iterate_over])
        finally:
            # Release the store handle even if the dimension lookup fails.
            temp.close()
    else:
        already_rechunked = 0

    for i in tqdm(range(0, total, slice_lengths)):
        # Clamp to the real end of the data so a fully written store does
        # not produce an empty trailing slice that would then be appended.
        stop = min(i + slice_lengths, total)
        if stop <= already_rechunked:
            print(f'skipped {i}')
            continue

        # When resuming in the middle of a slice, write only the part that
        # is not in the target yet.
        start = max(i, already_rechunked)
        zarr_slice = zarr.isel({iterate_over: slice(start, stop)})

        # Drop the encodings carried over from the source store —
        # presumably so stale chunk/compressor settings do not conflict
        # with the new chunking on write (TODO confirm).
        for v in list(zarr_slice.coords.keys()):
            zarr_slice[v].encoding.clear()

        for var in zarr_slice:
            zarr_slice[var].encoding.clear()

        if chunk_subset is not None:
            zarr_slice[chunk_subset] = zarr_slice[chunk_subset].chunk(chunks)
        else:
            zarr_slice = zarr_slice.chunk(chunks)

        # Only (re)create the store when nothing has been written yet.
        # Testing `start` instead of `i` keeps a resumed run from wiping an
        # existing store that holds fewer entries than one slice.
        if start == 0:
            zarr_slice.to_zarr(target_zarr, mode="w")
        else:
            zarr_slice.to_zarr(target_zarr, append_dim=iterate_over)

        # Free the slice before building the next one to limit memory use.
        del zarr_slice
        gc.collect()

In [None]:
# Open the SARAH3 time-series Zarr store; the path is relative to the
# notebook's working directory.
sarah = xarray.open_zarr('../../ZARR/SARAH3_timeseries_chunk.zarr')
# Bare expression so the dataset repr is shown as the cell output.
sarah

In [38]:
# Destination store for the rechunked data.
target_zarr = '/scratch/snx3000/kschuurm/ZARR/SARAH3.zarr'

# One chunk per time step; -1 keeps each of the other dimensions in a single chunk.
chunks = {'data':-1, 'time':1, 'y':-1, 'x':-1}

# NOTE(review): `seviri18` is not defined in any visible cell, so this call
# raises NameError on a fresh kernel. The target is named SARAH3 and the
# previous cell opens `sarah` — confirm which dataset is meant here.
# overwrite=True makes the function ignore any existing target and start over.
rechunk_zarr_store(seviri18, chunks, target_zarr, slice_lengths=192, 
                   iterate_over='time', 
                   chunk_subset='__xarray_dataarray_variable__',
                  overwrite=True)

100%|██████████| 155/155 [08:17<00:00,  3.21s/it]


In [27]:
# NOTE(review): opens the same store that a later cell binds to `seviri2018`;
# `test` is not referenced anywhere else in the visible notebook.
test = xarray.open_zarr('/scratch/snx3000/kschuurm/ZARR/SEVIRI_2018_2.zarr')

In [4]:
# Open the three SEVIRI source stores in one pass; the order of the paths
# matches the order of the names on the left-hand side.
seviri2018, seviri2016_2022, seviri2012_2015 = (
    xarray.open_zarr(store_path)
    for store_path in (
        '/scratch/snx3000/kschuurm/ZARR/SEVIRI_2018_2.zarr',
        '/scratch/snx3000/acarpent/SEVIRI_2016-2022.zarr',
        '/scratch/snx3000/acarpent/SEVIRI_2012-2015.zarr',
    )
)


In [5]:
# Map every year 2012-2022 to the dataset that holds it. The two multi-year
# stores cover their ranges; 2018 is then pointed at its dedicated store.
# Re-assigning an existing key keeps its position, so keys stay in
# ascending year order.
year_to_ds = {
    **dict.fromkeys(range(2012, 2016), seviri2012_2015),
    **dict.fromkeys(range(2016, 2023), seviri2016_2022),
    2018: seviri2018,
}

In [9]:
# Concatenate all years into one Zarr store, 1000 time steps at a time.
slice_lengths = 1000
target_zarr = '/scratch/snx3000/kschuurm/ZARR/SEVIRI_RSS.zarr'
# One chunk per time step; -1 keeps each of the other dimensions in a single chunk.
chunks = {'data':-1, 'time':1, 'y':-1, 'x':-1}

# Flips to True once the first slice has created the store; every slice after
# that is appended. This replaces the hard-coded `year == 2012` check, which
# silently breaks if the first year in `year_to_ds` changes or is empty.
store_created = False

for year in tqdm(year_to_ds.keys()):
    # Keep only this year's timestamps from the dataset that holds the year.
    # Named `year_ds` (not `zarr`) so the imported `zarr` module is not
    # overwritten in the notebook namespace.
    year_ds = year_to_ds[year].sel(time=str(year))

    for i in tqdm(range(0, len(year_ds['time']), slice_lengths)):

        zarr_slice = year_ds.isel({'time': slice(i, i + slice_lengths)})

        # Drop the encodings carried over from the source store — presumably
        # so stale chunk settings do not conflict with the new chunking on
        # write (TODO confirm).
        for v in list(zarr_slice.coords.keys()):
            zarr_slice[v].encoding.clear()

        for var in zarr_slice:
            zarr_slice[var].encoding.clear()

        # Only the main data variable is rechunked.
        zarr_slice['__xarray_dataarray_variable__'] = zarr_slice['__xarray_dataarray_variable__'].chunk(chunks)

        if not store_created:
            zarr_slice.to_zarr(target_zarr, mode="w")
            store_created = True
        else:
            zarr_slice.to_zarr(target_zarr, append_dim='time')

        # Free the slice before building the next one to limit memory use.
        del zarr_slice
        gc.collect()

  0%|          | 0/11 [00:00<?, ?it/s]
  0%|          | 0/29 [00:00<?, ?it/s][A
  3%|▎         | 1/29 [00:09<04:25,  9.47s/it][A
  7%|▋         | 2/29 [00:16<03:39,  8.14s/it][A
 10%|█         | 3/29 [00:24<03:22,  7.78s/it][A
 14%|█▍        | 4/29 [00:33<03:26,  8.27s/it][A
 17%|█▋        | 5/29 [00:43<03:35,  8.97s/it][A
 21%|██        | 6/29 [00:52<03:28,  9.06s/it][A
 24%|██▍       | 7/29 [01:02<03:23,  9.23s/it][A
 28%|██▊       | 8/29 [01:13<03:24,  9.76s/it][A
 31%|███       | 9/29 [01:22<03:11,  9.58s/it][A
 34%|███▍      | 10/29 [01:30<02:54,  9.17s/it][A
 38%|███▊      | 11/29 [01:40<02:51,  9.54s/it][A
 41%|████▏     | 12/29 [01:52<02:52, 10.16s/it][A
 45%|████▍     | 13/29 [02:02<02:43, 10.19s/it][A
 48%|████▊     | 14/29 [02:12<02:31, 10.11s/it][A
 52%|█████▏    | 15/29 [02:23<02:26, 10.44s/it][A
 55%|█████▌    | 16/29 [02:33<02:14, 10.32s/it][A
 59%|█████▊    | 17/29 [02:43<01:59,  9.97s/it][A
 62%|██████▏   | 18/29 [02:55<01:58, 10.80s/it][A
 66%|█████

In [None]:
# Open the zipped copy of the SARAH3 store to inspect it.
# NOTE(review): this relies on the Zarr backend accepting a `.zip` path
# directly — confirm for the installed zarr/xarray versions.
a = xarray.open_zarr('/scratch/snx3000/kschuurm/ZARR/SARAH3.zip')

In [None]:
# Bare expression so the dataset repr is shown as the cell output.
a