In [None]:
import xarray
import numpy as np
import zarr
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress
from rechunker import rechunk
from tqdm import tqdm
import pandas as pd
import gc
import os

In [None]:
# Create a Dask distributed client; called without a scheduler address, so it
# starts its own local workers. Ending the cell with `client` displays its summary.
# NOTE(review): the worker count is hard-coded to 32 — confirm this matches the
# node the notebook runs on.
from dask.distributed import Client
client = Client(n_workers=32)
client

In [None]:
import shutil
import gc

def rechunk_zarr_store(zarr, chunks, target_zarr, steps=200,
                       slice_lengths=None, iterate_over='time',
                       chunk_subset=None):
    """Rechunk a dataset into a Zarr store, one slice of ``iterate_over`` at a time.

    The dataset is cut into consecutive slices along ``iterate_over``. Each slice
    is rechunked and written to ``target_zarr``: the very first slice creates the
    store, every later slice is appended along ``iterate_over``.

    If ``target_zarr`` already exists, its current length along ``iterate_over``
    is taken as the number of positions that were already written; those
    positions are skipped and the remaining ones are appended, so an interrupted
    run can be resumed by calling the function again with the same arguments.

    Parameters
    ----------
    zarr : dataset
        Source dataset (the notebook passes the result of ``xarray.open_zarr``).
        Note that this parameter name shadows the ``zarr`` module inside the
        function body.
    chunks : dict
        Chunk specification forwarded to ``.chunk()``.
    target_zarr : str
        Path of the store to create or extend.
    steps : int, default 200
        Number of slices to split the dimension into. Only used when
        ``slice_lengths`` is None.
    slice_lengths : int or None
        Number of positions along ``iterate_over`` written per iteration.
        Defaults to ``len(zarr[iterate_over]) // steps`` (at least 1).
    iterate_over : str, default 'time'
        Dimension to slice and append along.
    chunk_subset : hashable or None
        If given, only this variable is rechunked with ``chunks``; otherwise the
        whole slice is rechunked.

    Raises
    ------
    ValueError
        If ``slice_lengths`` is smaller than 1.
    """
    total = len(zarr[iterate_over])

    if slice_lengths is None:
        # A dimension shorter than `steps` would give a slice length of 0, which
        # is not a valid range() step; fall back to one position per slice.
        slice_lengths = max(1, total // steps)
    elif slice_lengths < 1:
        raise ValueError(f'slice_lengths must be >= 1, got {slice_lengths}')

    if os.path.exists(target_zarr):
        # Resume: whatever is already in the target counts as done.
        with xarray.open_zarr(target_zarr) as existing:
            already_rechunked = len(existing[iterate_over])
    else:
        already_rechunked = 0

    for start in tqdm(range(0, total, slice_lengths)):
        stop = start + slice_lengths
        if stop <= already_rechunked:
            print(f'skipped {start}')
            continue

        # When the resume point falls inside this slice, write only its tail.
        begin = max(start, already_rechunked)
        zarr_slice = zarr.isel({iterate_over: slice(begin, stop)})

        # Clear the encoding carried over from the source store (object-dtype
        # coordinates and all data variables) before writing —
        # presumably so the old on-disk chunking does not conflict with `chunks`.
        for name in list(zarr_slice.coords.keys()):
            if zarr_slice.coords[name].dtype == object:
                zarr_slice[name].encoding.clear()

        for var in zarr_slice:
            zarr_slice[var].encoding.clear()

        if chunk_subset is not None:
            zarr_slice[chunk_subset] = zarr_slice[chunk_subset].chunk(chunks)
        else:
            zarr_slice = zarr_slice.chunk(chunks)

        # Only a genuinely fresh run may (over)write the store. Checking
        # `start == 0` alone would wipe existing data when resuming inside the
        # first slice.
        if already_rechunked == 0 and start == 0:
            zarr_slice.to_zarr(target_zarr, mode="w")
        else:
            zarr_slice.to_zarr(target_zarr, append_dim=iterate_over)

        # Release the slice before building the next one to limit memory use.
        del zarr_slice
        gc.collect()

In [None]:
# Open the combined full-disk store that serves as input for the rechunking below.
# NOTE(review): this is the same path the "Combine ZARR" section further down
# writes to, so on a fresh kernel that section has to be run before this cell.
fulldisk = xarray.open_zarr('/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK.zarr')



In [None]:
# Write a copy of `fulldisk` whose `channel_data` variable is rechunked to small
# 15x15 spatial tiles, with the channel and time axes left unsplit (-1).
# NOTE(review): the data is written in slices of 10000 time steps, so
# 'time': -1 presumably yields time chunks of at most one slice rather than the
# whole axis — confirm this is the intended layout.
target_zarr = '/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK_timechunked.zarr'

chunks = dict(channel=-1, time=-1, y=15, x=15)

rechunk_zarr_store(
    fulldisk,
    chunks,
    target_zarr,
    slice_lengths=10000,
    iterate_over='time',
    chunk_subset='channel_data',
)

In [None]:
# Open the 2018 SEVIRI store for inspection (was `test = = ...`, a syntax error).
test = xarray.open_zarr('/scratch/snx3000/kschuurm/ZARR/SEVIRI_2018.zarr')

In [None]:
# Display `sarah` with its dimensions reordered; the result is not assigned.
# NOTE(review): `sarah` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel unless it is created elsewhere.
sarah.transpose('data', 'time', 'y','x')

In [None]:
# Open the 2016-2022 SEVIRI store and display its summary; the dataset is not
# bound to a name, so it is only available as this cell's output.
xarray.open_zarr('/scratch/snx3000/acarpent/SEVIRI_2016-2022.zarr')

# Combine ZARR

In [None]:

# Destination store that the per-year stores are combined into.
target_zarr = '/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK.zarr'

# One source store per year, 2016 through 2022 (the range end is exclusive).
# Generated from the year so that adding a year is a one-number change.
zarrstores_fn = {
    year: f'/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK_{year}.zarr'
    for year in range(2016, 2023)
}

# Open every yearly store, keyed by year.
zarrstores = {k: xarray.open_zarr(v) for k,v in zarrstores_fn.items()}

In [None]:
# Combine the per-year stores into `target_zarr`, appending along `time` in
# fixed-size batches. If the target already exists, the run resumes: timestamps
# that are already in the target are left out instead of being appended again.

# Number of timestamps written per to_zarr call.
steps = 20000

first = True

already_processed = None

if os.path.exists(target_zarr):
    first = False
    # Remember which timestamps the target already holds; the context manager
    # closes the store even if reading fails.
    with xarray.open_zarr(target_zarr) as a:
        already_processed = a.time.values

for year, store in zarrstores.items():

    # Keep only the timestamps that fall in `year`, dropping repeated ones.
    yr_store = store.sel(time=(store.time.dt.year == year)).drop_duplicates(dim='time', keep='first')
    timeindex = np.sort(np.array(list(set(yr_store.time.values))))
    timeindex = pd.DatetimeIndex(timeindex)
    if already_processed is not None:
        # Previously this value was computed but never used, so a re-run
        # appended every timestamp a second time.
        timeindex = timeindex[~timeindex.isin(already_processed)]
    idx_start = 0
    idx_end = len(timeindex)
    print(idx_end)

    for istart in tqdm(range(idx_start, idx_end, steps)):
        iend = min(istart + steps, idx_end)

        times = timeindex[istart:iend]
        ds_slice = yr_store.sel(time=times)

        # Clear the encoding of every coordinate and data variable before
        # writing (unlike rechunk_zarr_store, not only object-dtype coordinates).
        for v in list(ds_slice.coords.keys()):
            ds_slice[v].encoding.clear()

        for var in ds_slice:
            ds_slice[var].encoding.clear()

        # One chunk per time step holding all channels and the full x/y extent;
        # the time coordinate itself is kept as a single chunk per batch.
        ds_slice['channel_data'] = ds_slice['channel_data'].chunk({'channel':-1, 'time':1, 'x':-1, 'y':-1})
        ds_slice['time'] = ds_slice.time.chunk({'time':-1})

        if first:
            ds_slice.to_zarr(target_zarr, mode='w')
            first = False
        else:
            ds_slice.to_zarr(target_zarr, append_dim='time')

        gc.collect()


In [None]:
import zarr
import xarray

# Export the combined store into a single zip archive.
fulldisk = xarray.open_zarr('/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK.zarr')


store = zarr.storage.ZipStore('/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK.zip')

# Clear the encoding of every coordinate and data variable before writing.
for v in list(fulldisk.coords.keys()):
    fulldisk[v].encoding.clear()

for var in fulldisk:
    fulldisk[var].encoding.clear()

# One chunk per time step holding all channels and the full x/y extent.
fulldisk['channel_data'] = fulldisk['channel_data'].chunk({'channel':-1, 'time':1, 'x':-1, 'y':-1})
fulldisk['time'] = fulldisk.time.chunk({'time':-1})
try:
    fulldisk.to_zarr(store)
finally:
    # A ZipStore has to be closed explicitly, otherwise the archive may be left
    # incomplete; close it even when the write fails.
    store.close()

In [None]:

# Re-open the combined store; rebinding `fulldisk` replaces whatever dataset
# object the name previously referred to in this kernel.
fulldisk = xarray.open_zarr('/capstor/scratch/cscs/kschuurm/ZARR/SEVIRI_FULLDISK.zarr')

In [None]:
# Display the dataset summary.
fulldisk

In [None]:
# Display the dataset with repeated time stamps dropped. The result is not
# assigned, so `fulldisk` keeps referring to the dataset opened above.
fulldisk.drop_duplicates(dim='time')