# In this notebook we compare the read performance of kerchunk vs. netCDF over POSIX and HTTPS access

In [1]:
import glob
import re
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import fsspec
from pathlib import Path
import ujson
import intake_esm
import intake
import fsspec

In [2]:
import dask
from dask.distributed import Client, performance_report
from dask_jobqueue import PBSCluster

In [3]:
def replace_directory_paths(file_paths, old_directory_path, new_directory_path):
    """Swap the leading directory of each path for a new directory (or URL root).

    Only a *leading* ``old_directory_path`` is stripped. A path that does not
    start with it is kept whole (minus any leading separators) and simply
    re-rooted under ``new_directory_path``, so an accidental match in the
    middle of a path can no longer mangle it.

    Args:
        file_paths (list): Full path strings, normally starting with
            ``old_directory_path``.
        old_directory_path (str): The leading directory prefix to remove.
        new_directory_path (str): The directory path (or URL root) to prepend.

    Returns:
        list: New full paths, in the same order as ``file_paths``.
    """
    new_file_paths = []
    for full_path in file_paths:
        # Strip the old directory only when it really is the prefix.
        if full_path.startswith(old_directory_path):
            relative_path = full_path[len(old_directory_path):]
        else:
            relative_path = full_path

        # Drop leading separators so os.path.join does not treat the
        # remainder as an absolute path and discard the new directory.
        new_file_paths.append(
            os.path.join(new_directory_path, relative_path.lstrip('/\\'))
        )

    return new_file_paths

In [4]:
import os
import ujson
from fsspec.implementations.local import LocalFileSystem
import kerchunk.hdf
import pandas as pd

In [5]:
# Storage locations used throughout the notebook.
# Roots: personal scratch space, the shared data tree, and the HTTPS root
# (presumably serving the same data tree -- confirm against the server layout).
rda_scratch = "/gpfs/csfs1/collections/rda/scratch/harshah"
rda_data = "/gpfs/csfs1/collections/rda/data/"
rda_url = "https://data.rda.ucar.edu/"
myrda_data = f"{rda_data}harshah/"

# ERA5 `era5_tas` directory and its Zarr / netCDF subdirectories.
tas_directory = f"{myrda_data}era5_tas/"
zarr_directory = f"{tas_directory}zarr/"
nc_directory = f"{tas_directory}netcdf/"

In [6]:
# def create_new_file_paths(file_urls, output_directory):
#     """Create new file paths for the rechunked files."""
#     new_file_paths = []
#     for file_url in file_urls:
#         # Extract the file name from the file URL and remove '#mode=bytes'
#         file_name = os.path.basename(file_url).replace('#mode=bytes', '')
#         # Create the new file path
#         new_file_path = os.path.join(output_directory, file_name)
#         new_file_paths.append(new_file_path)
#     return new_file_paths


def process_urls(file_urls, base_directory, dataset_id='ds633.0'):
    """Map dataset URLs onto file paths under a base directory.

    For each URL, any ``#...`` fragment (e.g. ``#mode=bytes``) is dropped,
    everything before the dataset id is discarded, and the remainder
    (``<dataset_id>/...``) is joined onto ``base_directory``.

    Args:
        file_urls (list): List of URLs to process.
        base_directory (str): Directory path to prepend to the processed paths.
        dataset_id (str): Dataset directory name that marks where the relative
            path starts. Defaults to ``'ds633.0'``; a trailing ``/`` is optional.

    Returns:
        list: List of processed file paths, in the same order as ``file_urls``.

    Raises:
        ValueError: If ``<dataset_id>/`` does not occur exactly once in a URL.
    """
    # Normalise so both 'ds633.0' and 'ds633.0/' are accepted.
    marker = dataset_id.rstrip('/') + '/'

    processed_paths = []
    for url in file_urls:
        # Remove the fragment, e.g. '#mode=bytes'
        clean_url = url.split('#')[0]

        # Exactly one occurrence of the marker yields [prefix, remainder].
        path_parts = clean_url.split(marker)
        if len(path_parts) != 2:
            raise ValueError(f"Unexpected URL format: {url}")

        # Re-root '<dataset_id>/<remainder>' under the base directory
        processed_paths.append(os.path.join(base_directory, marker + path_parts[1]))

    return processed_paths

## Spin up cluster and scale

In [7]:
# Create a PBS cluster object
# Each job requests one CPU and 8 GB from PBS (see resource_spec) and is
# configured with cores=1 / processes=1 and an 8 GiB dask memory setting.
cluster = PBSCluster(
    job_name = 'dask-wk24-hpc',
    cores = 1,
    memory = '8GiB',
    processes = 1,
    local_directory = rda_scratch+'/dask/spill',  # worker-local directory, placed on scratch
    log_directory = rda_scratch +'/dask/',  # log directory, also on scratch
    resource_spec = 'select=1:ncpus=1:mem=8GB',  # PBS resource selection string
    queue = 'casper',
    walltime = '2:00:00',
    # NOTE(review): 'ib0' is disabled in favour of 'ext'; presumably a choice of
    # network interface for worker traffic -- confirm which one the benchmarks need.
    #interface = 'ib0'
    interface = 'ext'
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45377 instead


In [8]:
# Attach a distributed client to the PBS cluster; the bare `client` expression
# displays its summary (dashboard link, worker count, memory).
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/45377/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/45377/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.96:42373,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/45377/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [9]:
# Ask the cluster for 10 workers.
# NOTE(review): the client summary displayed earlier reports 0 workers and
# nothing here waits for workers to start -- confirm they are running before
# the timed reads (e.g. client.wait_for_workers) so queue time is not measured.
cluster.scale(10)

## Use glob to get file paths and test POSIX read speeds

In [10]:
# List every entry directly under the Zarr directory, sorted by name
# (file names embed the date range, so this presumably yields chronological order).
zarr_paths  = sorted(glob.glob(zarr_directory +'*'))
zarr_paths

['/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.zarr',
 '/gpfs/csfs1/collections/rda/data/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940080100_1940083123.zarr',
 '/gpfs/csfs1/collections/rda/data/harsh

In [11]:
# Open the intake-esm catalog stored under scratch and keep only the entries
# whose `variable` is 'VAR_2T'; the bare expression displays a catalog summary.
catalog = intake.open_esm_datastore(rda_scratch + '/intake_catalogs/https/era5_catalog_https.json')
cat_temp = catalog.search(variable ='VAR_2T')
cat_temp

  df = pd.read_csv(


Unnamed: 0,unique
era_id,1
datatype,1
level_type,0
step_type,1
table_code,1
param_code,1
variable,1
long_name,1
units,1
year,85


In [12]:
# Take the first 12 entries of the catalog's `path` column as a plain list.
# NOTE(review): presumably the 12 monthly files of 1940 -- this relies on the
# catalog rows being in chronological order; confirm.
ncfile_urls = cat_temp.df['path'][:12].tolist()
ncfile_urls

['https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194001/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194002/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194003/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194004/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194005/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194006/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194007/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.nc#mode=bytes',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194008/e5.oper.an.sfc.128_167_2t.ll025s

In [13]:
# Convert the catalog's HTTPS URLs into file paths under `rda_data`
# (drops the '#mode=bytes' fragment and the host part of each URL).
nc_paths = process_urls(ncfile_urls, rda_data)
nc_paths

['/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194001/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194002/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194003/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194004/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194005/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194006/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194007/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.nc',
 '/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194008/e5.oper.an.sfc.128_167_2t.ll025sc.1940080100_19400831

### Compare POSIX read speeds and generate performance reports

In [14]:
%%time
## POSIX + Zarr: open the first three stores, load VAR_2T into memory,
## and save a dask performance report to posix_zarr_read.html
with performance_report(filename='posix_zarr_read.html'):
    tas_zarr = xr.open_mfdataset(zarr_paths[:3], engine='zarr').VAR_2T.compute()

CPU times: user 11.7 s, sys: 37.1 s, total: 48.8 s
Wall time: 1min 19s


In [15]:
%%time
## POSIX + netCDF: open the first three files, load VAR_2T into memory,
## and save a dask performance report to posix_netcdf_read.html
with performance_report(filename='posix_netcdf_read.html'):
    tas_nc = xr.open_mfdataset(nc_paths[:3], engine='netcdf4').VAR_2T.compute()

CPU times: user 21.4 s, sys: 42 s, total: 1min 3s
Wall time: 1min 27s


In [16]:
# Open only the first netCDF file (without computing) to inspect the shape
# and dask chunk layout of VAR_2T.
test_nc  = xr.open_mfdataset(nc_paths[0],engine='netcdf4').VAR_2T
test_nc

Unnamed: 0,Array,Chunk
Bytes,2.88 GiB,3.97 MiB
Shape,"(744, 721, 1440)","(27, 139, 277)"
Dask graph,1008 chunks in 2 graph layers,1008 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.88 GiB 3.97 MiB Shape (744, 721, 1440) (27, 139, 277) Dask graph 1008 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  721  744,

Unnamed: 0,Array,Chunk
Bytes,2.88 GiB,3.97 MiB
Shape,"(744, 721, 1440)","(27, 139, 277)"
Dask graph,1008 chunks in 2 graph layers,1008 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [17]:
# Open only the first Zarr store (without computing) to inspect the shape
# and dask chunk layout of VAR_2T, for comparison with the netCDF file.
test_zarr = xr.open_mfdataset(zarr_paths[0],engine='zarr').VAR_2T
test_zarr

Unnamed: 0,Array,Chunk
Bytes,2.88 GiB,3.97 MiB
Shape,"(744, 721, 1440)","(27, 139, 277)"
Dask graph,1008 chunks in 2 graph layers,1008 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.88 GiB 3.97 MiB Shape (744, 721, 1440) (27, 139, 277) Dask graph 1008 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  721  744,

Unnamed: 0,Array,Chunk
Bytes,2.88 GiB,3.97 MiB
Shape,"(744, 721, 1440)","(27, 139, 277)"
Dask graph,1008 chunks in 2 graph layers,1008 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Convert POSIX paths to HTTPS URLs and test read speeds

In [18]:
# Re-root the Zarr POSIX paths from `rda_data` onto the HTTPS root `rda_url`.
zarr_urls = replace_directory_paths(zarr_paths, rda_data, rda_url)
zarr_urls

['https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940080100_1940083123.zarr',
 'https://data.rda.ucar.edu/harshah/era5_tas/zarr/e5.oper.an.sfc.128_167_2t.ll025sc.1940090100_1

In [19]:
# Re-root the netCDF POSIX paths from `rda_data` onto the HTTPS root `rda_url`.
# NOTE(review): the HTTPS netCDF read in this notebook uses `ncfile_urls`
# (catalog URLs with '#mode=bytes'), not `nc_urls` -- confirm `nc_urls` is still needed.
nc_urls  =  replace_directory_paths(nc_paths, rda_data, rda_url)
nc_urls 

['https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194001/e5.oper.an.sfc.128_167_2t.ll025sc.1940010100_1940013123.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194002/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194003/e5.oper.an.sfc.128_167_2t.ll025sc.1940030100_1940033123.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194004/e5.oper.an.sfc.128_167_2t.ll025sc.1940040100_1940043023.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194005/e5.oper.an.sfc.128_167_2t.ll025sc.1940050100_1940053123.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194006/e5.oper.an.sfc.128_167_2t.ll025sc.1940060100_1940063023.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194007/e5.oper.an.sfc.128_167_2t.ll025sc.1940070100_1940073123.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.sfc/194008/e5.oper.an.sfc.128_167_2t.ll025sc.1940080100_1940083123.nc',
 'https://data.rda.ucar.edu/ds633.0/e5.oper.an.s

### Test HTTPS read speeds for netCDF vs. Zarr

In [20]:
%%time
## HTTPS + Zarr: open the first three stores by URL, load VAR_2T into memory,
## and save a dask performance report to https_zarr_read.html
with performance_report(filename='https_zarr_read.html'):
    tas_zarr_https = xr.open_mfdataset(zarr_urls[:3], engine='zarr').VAR_2T.compute()

CPU times: user 21 s, sys: 27.9 s, total: 48.9 s
Wall time: 2min 18s


In [22]:
%%time
## HTTPS + netCDF: open the first three catalog URLs, load VAR_2T into memory,
## and save a dask performance report to https_nc_read.html
## NOTE(review): this reads `ncfile_urls`, which keep the '#mode=bytes' suffix --
## presumably required by the netcdf4 engine for remote access; confirm.
with performance_report(filename='https_nc_read.html'):
    tas_nc_https = xr.open_mfdataset(ncfile_urls[:3], engine='netcdf4').VAR_2T.compute()

CPU times: user 37.2 s, sys: 27.1 s, total: 1min 4s
Wall time: 3min 36s
