# Comparing read performance of Zarr, NetCDF, and Kerchunk reference files over POSIX and HTTPS access

In [30]:
import glob
import re
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import fsspec
from pathlib import Path
import ujson
import intake_esm
import intake
import fsspec
import aiohttp

In [2]:
import dask
from dask.distributed import Client, performance_report
from dask_jobqueue import PBSCluster

In [3]:
def replace_directory_paths(file_paths, old_directory_path, new_directory_path):
    """Re-root every path in ``file_paths`` from one directory onto another.

    For each path, the first occurrence of ``old_directory_path`` is dropped,
    leading slashes/backslashes are trimmed from what is left, and the result
    is joined onto ``new_directory_path`` with ``os.path.join``.

    Args:
        file_paths (list): Full path strings that contain the old directory path.
        old_directory_path (str): Directory portion to strip from each path.
        new_directory_path (str): Directory (or URL root) to put in its place.
    Returns:
        list: The re-rooted paths, in the same order as the input.
    """
    def _reroot(path):
        # count=1: only the first match is removed, later repeats are kept.
        remainder = path.replace(old_directory_path, "", 1).lstrip('/\\')
        return os.path.join(new_directory_path, remainder)

    return [_reroot(path) for path in file_paths]

In [4]:
import os
import ujson
from fsspec.implementations.local import LocalFileSystem
import kerchunk.hdf
import pandas as pd

In [5]:
# File paths
# rda_scratch: scratch area used below for dask spill/log files and the intake catalog.
rda_scratch = "/gpfs/csfs1/collections/rda/scratch/harshah"
# rda_data / rda_url: POSIX root and HTTPS root; later cells swap one for the other
# to turn a local path into a URL.
rda_data    = "/gpfs/csfs1/collections/rda/data/"
rda_url     =  'https://data.rda.ucar.edu/'
myrda_data  = rda_data + 'harshah/'
#
# Directories are built by string concatenation, so each piece keeps its trailing '/'.
tas_directory =  myrda_data +'era5_tas/'
zarr_directory   = tas_directory + 'zarr/'  
nc_directory     = tas_directory + 'netcdf/' 
# Directory holding the kerchunk reference JSON files opened in the "ref" read cells.
ref_directory = os.path.join(myrda_data,'sidecar','era5')
print(ref_directory)

/gpfs/csfs1/collections/rda/data/harshah/sidecar/era5


In [6]:
def process_urls(file_urls, base_directory):
    """Map ds633.0 URLs onto file paths underneath ``base_directory``.

    Anything from the first '#' onwards (e.g. '#mode=bytes') is discarded, the
    part of the URL before 'ds633.0/' is dropped, and what remains is joined
    onto ``base_directory``.

    Args:
        file_urls (list): URLs to convert.
        base_directory (str): Directory to prepend to each 'ds633.0/...' path.
    Returns:
        list: Converted file paths, in input order.
    Raises:
        ValueError: If 'ds633.0/' does not split the URL into exactly two parts.
    """
    def _to_path(url):
        without_fragment = url.split('#')[0]
        pieces = without_fragment.split('ds633.0/')
        # Exactly one 'ds633.0/' marker is expected; anything else is rejected.
        if len(pieces) != 2:
            raise ValueError(f"Unexpected URL format: {url}")
        return os.path.join(base_directory, 'ds633.0/' + pieces[1])

    return [_to_path(url) for url in file_urls]

## Spin up cluster and scale

In [7]:
# Create a PBS cluster object
# Each PBS job hosts a single one-core worker process with 8GiB of memory;
# spill files and logs are written under rda_scratch.
cluster = PBSCluster(
    job_name = 'dask-wk24-hpc',
    cores = 1,
    memory = '8GiB',
    processes = 1,
    local_directory = rda_scratch+'/dask/spill',
    log_directory = rda_scratch +'/dask/',
    resource_spec = 'select=1:ncpus=1:mem=8GB',
    queue = 'casper',
    walltime = '2:00:00',
    #interface = 'ib0'
    # NOTE(review): 'ext' presumably selects the externally routed network interface
    # (the HTTPS cells need outbound access) -- confirm against the site's docs.
    interface = 'ext'
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37337 instead


In [8]:
# Connect a distributed client to the PBS cluster; the bare name displays its summary.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37337/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37337/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.95:46493,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/37337/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [9]:
# Request 5 workers from the cluster (the client summary above showed 0 workers).
cluster.scale(5)

## Use glob to get file paths and test posix read speeds

In [10]:
# Collect every entry directly under zarr_directory, sorted by name.
zarr_paths  = sorted(glob.glob(zarr_directory +'*'))
zarr_paths

['/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940080100_1940083123.zarr',
 '/gpfs/csfs1/collections/rda/data/harsh

In [11]:
# Open the intake-esm catalog stored under rda_scratch (the HTTPS variant of the ERA5 catalog).
catalog = intake.open_esm_datastore(rda_scratch + '/intake_catalogs/https/era5_catalog_https.json')
# cat_temp = catalog.search(variable ='t')
# cat_temp
# Select temperature data on pressure levels (pl) for the month of July
cat_pl_july = catalog.search(step_type ='pl',variable='T',month=7)
cat_pl_july 

  df = pd.read_csv(


Unnamed: 0,unique
era_id,1
datatype,1
level_type,0
step_type,1
table_code,1
param_code,1
variable,1
long_name,1
units,1
year,84


In [12]:
# ncfile_urls = cat_temp.df['path'][:12].tolist()
days =31
# Take the first `days` paths from the July search result (31 files, not 3).
# The 3-file subset used by the benchmarks is chosen later via number_of_files_to_read.
ncfile_urls = cat_pl_july.df['path'][:days].to_list()
ncfile_urls[0]

'https://data.rda.ucar.edu/ds633.0/e5.oper.an.pl/194007/e5.oper.an.pl.128_130_t.ll025sc.1940070100_1940070123.nc#mode=bytes'

In [13]:
# Convert the HTTPS URLs into POSIX paths under rda_data; show the last one as a check.
nc_paths = process_urls(ncfile_urls, rda_data)
nc_paths[-1]

'/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.pl/194007/e5.oper.an.pl.128_130_t.ll025sc.1940073100_1940073123.nc'

### Write these files to a single zarr store

In [14]:
%%time
# Lazily open all files in nc_paths as one dataset. The trailing `.T` is attribute
# access to the data variable named "T" on the Dataset, not a transpose.
temp_nc_july = xr.open_mfdataset(nc_paths,engine='netcdf4').T
temp_nc_july

CPU times: user 495 ms, sys: 50.5 ms, total: 546 ms
Wall time: 2.8 s


Unnamed: 0,Array,Chunk
Bytes,106.47 GiB,146.54 MiB
Shape,"(744, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,744 chunks in 63 graph layers,744 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 106.47 GiB 146.54 MiB Shape (744, 37, 721, 1440) (1, 37, 721, 1440) Dask graph 744 chunks in 63 graph layers Data type float32 numpy.ndarray",744  1  1440  721  37,

Unnamed: 0,Array,Chunk
Bytes,106.47 GiB,146.54 MiB
Shape,"(744, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,744 chunks in 63 graph layers,744 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [15]:
# %%time
# temp_nc_july.to_dataset().to_zarr(zarr_directory + 'e5.pl.T.194007.zarr',mode='w')

### Compare read speeds for posix and generate reports

In [16]:
# Size of every benchmark below: the read cells load 24*number_of_files_to_read
# time steps, or open this many files.
number_of_files_to_read = 3 # Also number of days of data

In [17]:
%%time
## Generate performance report
# Benchmark: read the first 24*number_of_files_to_read time steps of T from the
# local zarr store, recording dask activity in posix_zarr_read.html.
with performance_report(filename ='posix_zarr_read.html'):
    # tas_zarr = xr.open_mfdataset(zarr_paths[:3],engine='zarr').VAR_2T
    # Select the data variable by key. Attribute access (`.T`) works on the Dataset,
    # but applied a second time to the resulting DataArray it means "transpose",
    # which reversed the dimension order in this benchmark only.
    tas_zarr = xr.open_mfdataset(zarr_directory + 'e5.pl.T.194007.zarr', engine='zarr')['T']
    tas_zarr = tas_zarr.isel(time=slice(0,24*number_of_files_to_read))
    # Pull the selected slab into memory; this is the step being timed.
    tas_zarr = tas_zarr.compute()

CPU times: user 53 s, sys: 12.9 s, total: 1min 5s
Wall time: 1min 51s


In [18]:
%%time
## Generate performance report
# Benchmark: read T from the netCDF files over POSIX, recording dask activity
# in posix_netcdf_read.html.
with performance_report(filename ='posix_netcdf_read.html'):
     # All paths in nc_paths are opened; only the first
     # 24*number_of_files_to_read time steps are kept and loaded.
     tas_nc = xr.open_mfdataset(nc_paths,engine='netcdf4')
     # `.T` on the Dataset selects the variable named "T" (not a transpose).
     tas_nc = tas_nc.T.isel(time=slice(0,24*number_of_files_to_read))
     tas_nc = tas_nc.compute()

CPU times: user 5.64 s, sys: 12.2 s, total: 17.9 s
Wall time: 37.8 s


In [19]:
%%time
## Generate performance report
# Benchmark: read T through the local kerchunk reference JSON, opened as a zarr
# store via fsspec's 'reference://' filesystem.
with performance_report(filename ='posix_ref_read.html'):
    url = f'{ref_directory}/era5.pl.t.local.json'
    # NOTE(review): open_dataset is called without `chunks`, so the variables are
    # presumably not dask-backed and the load runs in the notebook process rather
    # than on the workers -- confirm the performance report captures this read.
    ref_nc = xr.open_dataset('reference://', engine='zarr', 
                         backend_kwargs={'storage_options':{"fo":url}, 'consolidated':False},)
    # `.T` on the Dataset selects the variable named "T" (not a transpose).
    ref_nc = ref_nc.T.isel(time=slice(0,24*number_of_files_to_read))
    ref_nc = ref_nc.compute()

CPU times: user 50.6 s, sys: 12.2 s, total: 1min 2s
Wall time: 1min 9s


In [20]:
# Lazily open only the first netCDF file and select its variable "T" to inspect
# the per-file shape and chunking.
test_nc  = xr.open_mfdataset(nc_paths[0],engine='netcdf4').T
test_nc

Unnamed: 0,Array,Chunk
Bytes,3.43 GiB,146.54 MiB
Shape,"(24, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.43 GiB 146.54 MiB Shape (24, 37, 721, 1440) (1, 37, 721, 1440) Dask graph 24 chunks in 2 graph layers Data type float32 numpy.ndarray",24  1  1440  721  37,

Unnamed: 0,Array,Chunk
Bytes,3.43 GiB,146.54 MiB
Shape,"(24, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Change posix paths to https and test read speeds

In [21]:
# Turn the local zarr store path into its HTTPS URL by swapping rda_data for rda_url.
# The result is a one-element list.
zarr_url = replace_directory_paths([zarr_directory + 'e5.pl.T.194007.zarr'], rda_data, rda_url)
zarr_url

['https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.pl.T.194007.zarr']

In [22]:
# nc_urls  =  replace_directory_paths(nc_paths, rda_data, rda_url)
# nc_urls

#### Test read speeds for netcdf vs zarr

In [23]:
%%time
## Generate performance report
# Benchmark: read T from the zarr store over HTTPS, recording dask activity
# in https_zarr_read.html.
with performance_report(filename ='https_zarr_read.html'):
     tas_zarr_https = xr.open_mfdataset(zarr_url,engine='zarr')
     # `.T` on the Dataset selects the variable named "T" (not a transpose).
     tas_zarr_https = tas_zarr_https.T.isel(time=slice(0,24*number_of_files_to_read))
     tas_zarr_https = tas_zarr_https.load()

CPU times: user 5.59 s, sys: 12.4 s, total: 18 s
Wall time: 39.7 s


In [24]:
# Point every catalog URL at the THREDDS fileServer endpoint instead of
# data.rda.ucar.edu. The list is updated in place. Re-running the cell is harmless:
# a rewritten URL no longer contains the old prefix, so replace() leaves it untouched.
for i, url in enumerate(ncfile_urls):
    ncfile_urls[i] = url.replace(
        'data.rda.ucar.edu/ds633.0/e5.oper.an.pl/194007/',
        'thredds.rda.ucar.edu/thredds/fileServer/files/g/ds633.0/e5.oper.an.pl/194007/',
    )
# Show the URLs that the HTTPS netCDF benchmark will open.
ncfile_urls[:number_of_files_to_read]

['https://data.rda.ucar.edu/ds633.0/e5.oper.an.pl/194007/e5.oper.an.pl.128_130_t.ll025sc.1940070100_1940070123.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.pl/194007/e5.oper.an.pl.128_130_t.ll025sc.1940070200_1940070223.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.pl/194007/e5.oper.an.pl.128_130_t.ll025sc.1940070300_1940070323.nc#mode=bytes']

In [31]:
%%time
## Generate performance report
# Benchmark: read T from the netCDF files over HTTPS, recording dask activity
# in https_nc_read.html.
with performance_report(filename ='https_nc_read.html'):
     # Only the first number_of_files_to_read URLs are opened, so no time slice is
     # applied afterwards.
     tas_nc_https = xr.open_mfdataset(ncfile_urls[:number_of_files_to_read],engine='netcdf4')
     #tas_nc_https = xr.open_dataset(ncfile_urls[0])
     print(tas_nc_https)
     # `.T` on the Dataset selects the variable named "T" (not a transpose).
     tas_nc_https = tas_nc_https.T
     # NOTE(review): the saved output of this cell ends with
     # "RuntimeError: NetCDF: HDF error", presumably raised by this load since the
     # print above succeeded -- confirm the remote read works before relying on timings.
     tas_nc_https = tas_nc_https.load()

<xarray.Dataset> Size: 11GB
Dimensions:    (time: 72, level: 37, latitude: 721, longitude: 1440)
Coordinates:
  * latitude   (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0
  * level      (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03
  * longitude  (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8
  * time       (time) datetime64[ns] 576B 1940-07-01 ... 1940-07-03T23:00:00
Data variables:
    T          (time, level, latitude, longitude) float32 11GB dask.array<chunksize=(1, 37, 721, 1440), meta=np.ndarray>
    utc_date   (time) int32 288B dask.array<chunksize=(24,), meta=np.ndarray>
Attributes:
    DATA_SOURCE:          ECMWF: https://cds.climate.copernicus.eu, Copernicu...
    NETCDF_CONVERSION:    CISL RDA: Conversion from ECMWF GRIB 1 data to netC...
    NETCDF_VERSION:       4.8.1
    CONVERSION_PLATFORM:  Linux r4i1n19 4.12.14-95.51-default #1 SMP Fri Apr ...
    CONVERSION_DATE:      Sat Mar 18 19:53:08 MDT 2023
    Conventions:   

RuntimeError: NetCDF: HDF error

In [26]:
%%time
## Generate performance report
# Benchmark: read T through the remote kerchunk reference JSON, opened as a zarr
# store via fsspec's 'reference://' filesystem.
with performance_report(filename ='https_ref_read.html'):
    url = f'{ref_directory}/era5.pl.t.remote.json'
    # NOTE(review): open_dataset is called without `chunks`, so the variables are
    # presumably not dask-backed and the load runs in the notebook process rather
    # than on the workers -- confirm the performance report captures this read.
    ref_nc = xr.open_dataset('reference://', engine='zarr', 
                         backend_kwargs={'storage_options':{"fo":url}, 'consolidated':False},)
    # `.T` on the Dataset selects the variable named "T" (not a transpose).
    ref_nc = ref_nc.T.isel(time=slice(0,24*number_of_files_to_read))
    # Keep the result in ref_nc, as the POSIX reference cell does. Binding it to
    # tas_nc silently replaced the POSIX netCDF result held under that name.
    ref_nc = ref_nc.load()

CPU times: user 56.1 s, sys: 14.3 s, total: 1min 10s
Wall time: 1min 15s


In [27]:
# Re-open the HTTPS netCDF files lazily (no load) to display the shape and chunking
# of variable "T".
tas_nc_https = xr.open_mfdataset(ncfile_urls[:number_of_files_to_read],engine='netcdf4')
tas_nc_https = tas_nc_https.T
tas_nc_https

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,146.54 MiB
Shape,"(72, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,72 chunks in 7 graph layers,72 chunks in 7 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.30 GiB 146.54 MiB Shape (72, 37, 721, 1440) (1, 37, 721, 1440) Dask graph 72 chunks in 7 graph layers Data type float32 numpy.ndarray",72  1  1440  721  37,

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,146.54 MiB
Shape,"(72, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,72 chunks in 7 graph layers,72 chunks in 7 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [28]:
# Re-open the HTTPS zarr store lazily (no load) to display the shape and chunking
# of the selected slice of variable "T".
tas_zarr_https = xr.open_mfdataset(zarr_url,engine='zarr')
tas_zarr_https = tas_zarr_https.T.isel(time=slice(0,24*number_of_files_to_read))
tas_zarr_https 

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,146.54 MiB
Shape,"(72, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,72 chunks in 3 graph layers,72 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.30 GiB 146.54 MiB Shape (72, 37, 721, 1440) (1, 37, 721, 1440) Dask graph 72 chunks in 3 graph layers Data type float32 numpy.ndarray",72  1  1440  721  37,

Unnamed: 0,Array,Chunk
Bytes,10.30 GiB,146.54 MiB
Shape,"(72, 37, 721, 1440)","(1, 37, 721, 1440)"
Dask graph,72 chunks in 3 graph layers,72 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
