In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import dask
import os, sys
import glob
import zarr
from joblib import Parallel, delayed
import os
import dask.array as da

# Make the project-local `libraries` module importable.
# NOTE(review): appending '/' only helps if `libraries` sits at the filesystem root — confirm the intended path.
sys.path.append('/')
# Star import: names used later but not defined in this notebook (e.g. `chebyshev_vec`,
# `poly_order`) presumably come from here — verify against `libraries`.
from libraries import *

# Hourly timestamps from 2011-01-01 00:00 through 2020-12-31 23:00 (inclusive).
dates = pd.date_range(start='2011-01-01T00', end='2020-12-31T23', freq='h')
# Source Zarr store with the height-level wind speeds (opened read-only in the next cell).
wind_speed_zarr_store = '/media/harish/External_3/CERRA_wind_profiles_and_Chebyshev_coefficients/CERRA_height_level_winds.zarr'
# Target Zarr store for the Chebyshev coefficients.
# The "Cheybshev" spelling of the variable name is kept because later cells reference it.
Cheybshev_zarr_store = '/data/harish/CERRA_wind_profiles_and_Chebyshev_coefficients/CERRA_Chebyshev_coefficients.zarr'

In [2]:
import itertools

# Read-only handle on the source wind-speed store; show its layout.
wind_speed_store = zarr.open(wind_speed_zarr_store, mode="r")
print(wind_speed_store.tree())

# Coordinate arrays stored alongside the data.
time_dim = wind_speed_store["time"]
height_dim = wind_speed_store["heightAboveGround"]

# On-disk chunk shape of the wind-speed array.
chunk_sizes = wind_speed_store.wind_speed.chunks
print(chunk_sizes)

# Chunks per axis: ceiling division of each axis length by its chunk length.
num_chunks = [
    -(-axis_len // chunk_len)
    for axis_len, chunk_len in zip(wind_speed_store.wind_speed.shape, chunk_sizes)
]

# Every chunk index tuple, one entry per axis of the array, in row-major order.
all_chunks = list(itertools.product(*map(range, num_chunks)))
print("Total chunks:", len(all_chunks))
print("Chunks correspond to 24 hours (1 time chunk) along x and y:", all_chunks[:25])

/
 ├── heightAboveGround (12,) float64
 ├── time (87672,) int64
 └── wind_speed (87672, 1069, 1069, 12) float32
(24, 256, 256, 12)
Total chunks: 91325
Chunks correspond to 24 hours (1 time chunk) along x and y: [(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 1, 0, 0), (0, 1, 1, 0), (0, 1, 2, 0), (0, 1, 3, 0), (0, 1, 4, 0), (0, 2, 0, 0), (0, 2, 1, 0), (0, 2, 2, 0), (0, 2, 3, 0), (0, 2, 4, 0), (0, 3, 0, 0), (0, 3, 1, 0), (0, 3, 2, 0), (0, 3, 3, 0), (0, 3, 4, 0), (0, 4, 0, 0), (0, 4, 1, 0), (0, 4, 2, 0), (0, 4, 3, 0), (0, 4, 4, 0)]


# Initializing the Zarr store from sample data
- Once the store has been created, this step does not need to be repeated.

In [4]:
def init_zarr_store(zarr_store, dates):
    """
    Lay out the ``Chebyshev_coefficients`` array in a new Zarr store.

    A zero-filled template is derived from the source wind-speed store
    (``wind_speed_zarr_store``): one (y, x) slab is taken from its first time
    step and first height level, then expanded with a ``time`` axis of
    ``len(dates)`` entries and a ``coeff`` axis of ``poly_order + 1`` entries.
    The template is written with ``mode='w'`` and ``compute=False``; the
    object returned by ``to_zarr`` is discarded.

    Parameters
    ----------
    zarr_store :
        Location of the store to create (opened with ``mode='w'``).
    dates :
        Values assigned to the ``time`` coordinate; its length sets the size
        of the time axis.

    Notes
    -----
    ``poly_order`` is not defined in this notebook; presumably it comes from
    the ``libraries`` star import — verify.
    """
    source_wind = xr.open_zarr(wind_speed_zarr_store).wind_speed

    # Single horizontal slab of zeros, with the selected scalar coordinates dropped.
    slab = xr.zeros_like(source_wind).isel(time=0, heightAboveGround=0, drop=True)

    # Add the time and coefficient axes, then label the time axis.
    template = slab.expand_dims(time=len(dates), coeff=poly_order + 1)
    template['time'] = dates

    # Chunk layout and axis order of the target array.
    template = template.chunk({'time': 24, 'y': 256, 'x': 256, 'coeff': 5})
    template = template.transpose('time', 'y', 'x', 'coeff')

    dataset = template.to_dataset(name='Chebyshev_coefficients')
    dataset.to_zarr(zarr_store, compute=False, consolidated=True, mode='w')
# Initialize the Zarr store: this creates the store on disk. Because
# init_zarr_store calls to_zarr with compute=False, only the store layout
# (metadata) is written here; the array values are not computed or written.
# Once the store exists, use a non-destructive mode (e.g. region writes with
# mode="r+") for further operations; running this again with mode='w' would
# overwrite the existing data.
#init_zarr_store(Cheybshev_zarr_store, dates)

'\nInitialize the zarr store, which creates the zarr store in disk, with zeros. \nOnce created, better to chose append mode for further operations or else it will overwrite the existing data.\n'

In [4]:
def find_monthly_indices(year, month,dates=dates):
    """
    Locate the positions in ``dates`` that fall in the given year and month.

    Parameters
    ----------
    year, month :
        Calendar year and month to look up.
    dates :
        Iterable of objects exposing ``.year`` and ``.month``. Defaults to the
        module-level ``dates`` as it was when this function was defined.

    Returns
    -------
    tuple
        ``(first, last + 1)``: position of the first match and one past the
        position of the last match. The pair is also printed.

    Raises
    ------
    ValueError
        If no entry of ``dates`` matches.
    """
    first_hit = None
    last_hit = None
    # Single pass: remember where the matches begin and where they end.
    for position, stamp in enumerate(dates):
        if stamp.year == year and stamp.month == month:
            if first_hit is None:
                first_hit = position
            last_hit = position

    if first_hit is None:
        raise ValueError(f"No data available for year {year} and month {month}.")

    start, stop = first_hit, last_hit + 1
    print(start, stop)
    return start, stop

def Chebyshev_chunk(t_start, t_end, y_start, y_end, x_start, x_end, zarr_store=wind_speed_store):
    """
    Compute Chebyshev coefficients for one (time, y, x) block of wind speeds.

    Parameters
    ----------
    t_start, t_end, y_start, y_end, x_start, x_end :
        Half-open index bounds of the block along the time, y and x axes of
        ``zarr_store.wind_speed``. All height levels are always read.
    zarr_store :
        Opened Zarr group exposing a ``wind_speed`` array. The default is the
        ``wind_speed_store`` object that existed when this function was
        defined; re-run this cell if that handle is reopened.

    Returns
    -------
    The result of ``chebyshev_vec`` applied along ``"heightAboveGround"``.
    ``chebyshev_vec`` is not defined in this notebook (presumably it comes
    from ``libraries``), so the exact return type should be checked there.
    """
    # Slicing a Zarr array reads the selection into memory as a NumPy array,
    # so it is wrapped directly. Going through dask.array.from_array and then
    # .load() would only add graph-building overhead for the same data.
    block = zarr_store.wind_speed[t_start:t_end, y_start:y_end, x_start:x_end, :]

    # Label the axes; no coordinates are attached, only dimension names.
    wind = xr.DataArray(
        block,
        dims=["time", "y", "x", "heightAboveGround"],
        name="wind_speed",
    )
    # Trace the block shape (edge blocks are smaller than interior ones).
    print(wind.shape)
    return chebyshev_vec(wind, dim="heightAboveGround")

def write_chunk(ds_chunk, region, zarr_store):
    """
    Write one block of results into an existing Zarr store.

    Parameters
    ----------
    ds_chunk :
        Object exposing ``to_zarr`` (e.g. an xarray object) holding the block.
    region :
        Mapping of dimension name to slice giving where the block belongs.
    zarr_store :
        Target store; it is opened with ``mode="r+"``.
    """
    write_options = {"region": region, "mode": "r+"}
    ds_chunk.to_zarr(zarr_store, **write_options)

def _chunk_bounds(start, stop, step):
    """Yield (lo, hi) index pairs tiling [start, stop) in pieces of at most ``step``."""
    for lo in range(start, stop, step):
        yield lo, min(lo + step, stop)


def _compute_and_write_chunk(zarr_store, t_start, t_end, y_start, y_end, x_start, x_end):
    """Compute the coefficients for one block and write them to its region of ``zarr_store``."""
    region = {
        "time": slice(t_start, t_end),
        "y": slice(y_start, y_end),
        "x": slice(x_start, x_end),
    }
    write_chunk(
        Chebyshev_chunk(t_start, t_end, y_start, y_end, x_start, x_end),
        region,
        zarr_store,
    )


def write_chebyshev_monthly(zarr_store, year, month, n_jobs=os.cpu_count()):
    """
    Compute and write Chebyshev coefficients for the given year and month.

    The month is tiled into blocks of 24 time steps by 256 x 256 grid points.
    Each block becomes one joblib task that both computes the coefficients and
    writes them to the matching region of ``zarr_store``.

    Parameters
    ----------
    zarr_store :
        Target store, passed through to ``write_chunk``.
    year, month :
        Month to process; converted to time indices by ``find_monthly_indices``.
    n_jobs :
        Number of joblib workers. The default is the CPU count evaluated when
        this function was defined.

    Raises
    ------
    ValueError
        Propagated from ``find_monthly_indices`` when the month has no data.
    """
    # Find time indices for the given month and year
    t_start_global, t_end_global = find_monthly_indices(year, month)

    # Grid size and block sizes. The block sizes mirror the (24, 256, 256)
    # chunking used by init_zarr_store, so — assuming the target store was
    # created with that layout and the month starts on a multiple of 24 —
    # each task touches its own set of chunks.
    y = 1069
    x = 1069
    time_batch = 24
    batch_size_yx = 256

    # Only the index bounds are captured per task. The computation is
    # deferred into the worker: calling Chebyshev_chunk here instead would run
    # every block serially in this process and keep all results in memory
    # before any write starts.
    # NOTE(review): with a process-based joblib backend the workers must be
    # able to unpickle Chebyshev_chunk and what it references (the source
    # store handle, `libraries`) — confirm on a first run.
    tasks = [
        delayed(_compute_and_write_chunk)(
            zarr_store, t_start, t_end, y_start, y_end, x_start, x_end
        )
        for t_start, t_end in _chunk_bounds(t_start_global, t_end_global, time_batch)
        for y_start, y_end in _chunk_bounds(0, y, batch_size_yx)
        for x_start, x_end in _chunk_bounds(0, x, batch_size_yx)
    ]

    # Run all tasks in parallel
    with Parallel(n_jobs=n_jobs, verbose=10) as parallel:
        parallel(tasks)


In [None]:
# Process January 2020 and write its coefficients into the Chebyshev store.
write_chebyshev_monthly(Cheybshev_zarr_store, year=2020, month=1)

In [3]:
import dask.distributed as dd

print("Starting parallel computing...")

# Local cluster: 128 workers with one thread each, 2GB memory limit per
# worker, dashboard address set to '8787'.
cluster = dd.LocalCluster(
    n_workers=128,
    threads_per_worker=1,
    memory_limit='2GB',
    dashboard_address='8787',
)

# Connect a client to the cluster and show its summary.
client = dd.Client(cluster)
print(client)

Starting parallel computing...
<Client: 'tcp://127.0.0.1:40367' processes=128 threads=128, memory=238.42 GiB>


In [3]:
# Lazily open the wind-speed array and keep the whole of January 2011
# (every hourly step of that month, not a single time step).
wind_speed_lazy = xr.open_zarr(wind_speed_zarr_store).wind_speed
ds = wind_speed_lazy.sel(time='2011-01')
ds

Unnamed: 0,Array,Chunk
Bytes,38.01 GiB,72.00 MiB
Shape,"(744, 1069, 1069, 12)","(24, 256, 256, 12)"
Dask graph,775 chunks in 3 graph layers,775 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 38.01 GiB 72.00 MiB Shape (744, 1069, 1069, 12) (24, 256, 256, 12) Dask graph 775 chunks in 3 graph layers Data type float32 numpy.ndarray",744  1  12  1069  1069,

Unnamed: 0,Array,Chunk
Bytes,38.01 GiB,72.00 MiB
Shape,"(744, 1069, 1069, 12)","(24, 256, 256, 12)"
Dask graph,775 chunks in 3 graph layers,775 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [5]:
# Apply chebyshev_vec along the height dimension of the month selected in
# `ds`, then pull the full result into memory with .load().
# NOTE(review): chebyshev_vec is not defined in this notebook (presumably it
# comes from `libraries`); whether it stays lazy on Dask-backed input, and
# whether the result fits in memory for a whole month, should be confirmed.
chebyshev_vec(ds, dim="heightAboveGround").load()