In [18]:
import glob
import re

import matplotlib as plt
import numpy as np
import scipy as sp
import xarray as xr

In [2]:
import dask
from dask.distributed import Client, performance_report
from dask_jobqueue import PBSCluster

In [3]:
# Root directories: personal scratch space and the shared RDA data collection
rda_scratch = "/gpfs/csfs1/collections/rda/scratch/harshah"
rda_data = "/gpfs/csfs1/collections/rda/data/"
# ERA5 surface-analysis NetCDF inputs and the directory that receives the Zarr stores
era5_path = f"{rda_data}ds633.0/e5.oper.an.sfc/"
zarr_path = f"{rda_scratch}/tas_zarr/"

In [4]:
# Recursively collect every NetCDF file holding tas (surface air temperature at 2 m)
# below the ERA5 directory; glob returns them in no particular order.
tas_pattern = f"{era5_path}**/e5.oper.an.sfc.128_167_2t.*.nc"
tas_ncfiles = glob.glob(tas_pattern, recursive=True)

In [5]:
# Number of NetCDF files matched by the search pattern
len(tas_ncfiles)

1009

In [6]:
# Peek at one matched path (the recorded output shows glob order is not chronological)
tas_ncfiles[0]

'/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/202207/e5.oper.an.sfc.128_167_2t.ll025sc.2022070100_2022073123.nc'

In [7]:
# Create a PBS cluster object: every PBS job runs one single-core worker process
cluster = PBSCluster(
    # NOTE(review): the job name contains '/' characters — confirm the scheduler accepts it
    job_name="/dask_job/dask-wk23-hpc",
    cores=1,
    # NOTE(review): dask is given "8GiB" while resource_spec below requests "mem=8GB" —
    # confirm both denote the same amount on this scheduler
    memory="8GiB",
    processes=1,
    # Worker scratch/spill files are placed under the scratch directory
    local_directory=rda_scratch + "/dask/spill",
    resource_spec="select=1:ncpus=1:mem=8GB",
    queue="casper",
    walltime="5:30:00",
    # interface = 'ib0'
    interface="ext",
)

In [8]:
# GMST function ###
# calculate global means


def get_lat_name(ds):
    """Return the name of the latitude coordinate of ``ds``.

    ``ds.coords`` is checked for ``"lat"`` first and ``"latitude"`` second; the first
    name that is present is returned.

    Raises
    ------
    RuntimeError
        If neither name is among the coordinates.
    """
    candidates = ("lat", "latitude")
    found = next((name for name in candidates if name in ds.coords), None)
    if found is None:
        raise RuntimeError("Couldn't find a latitude coordinate")
    return found


def global_mean(ds):
    """Return the cosine-latitude-weighted mean of ``ds`` over all dimensions but ``time``.

    The weights are the cosine of the latitude coordinate (converted from degrees to
    radians), rescaled so that their own mean equals one.
    """
    latitude = ds[get_lat_name(ds)]
    weight = np.cos(np.deg2rad(latitude))
    # Normalise in place so the weights average to 1
    weight /= weight.mean()
    # Reduce over everything except the time axis
    spatial_dims = {dim for dim in ds.dims if dim != "time"}
    weighted = ds * weight
    return weighted.mean(spatial_dims)

In [9]:
# Attach a distributed client to the PBS cluster; the bare name displays its summary
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.119:38713,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [10]:
# Define a key function that extracts the YYYYMM part
def extract_date(filename):
    """Return the sort key for a file path.

    The key is the first six-digit path component (``/YYYYMM/``) found in ``filename``.
    When the path has no such component, ``filename`` itself is returned unchanged.
    """
    found = re.search(r"/(\d{6})/", filename)
    return found.group(1) if found else filename


# Order the files by their YYYYMM directory component so they are chronological
sorted_ncfiles = sorted(tas_ncfiles, key=extract_date)
# Spot-check one entry of the sorted list
sorted_ncfiles[1]

'/gpfs/csfs1/collections/rda/data/ds633.0/e5.oper.an.sfc/194002/e5.oper.an.sfc.128_167_2t.ll025sc.1940020100_1940022923.nc'

In [11]:
#
# Chunk sizes (grid points per chunk) applied to each dimension when rewriting to Zarr
lat_chunksize = 139
# NOTE: the misspelt name `lon_chuksize` is the one process_batch reads, so it is kept as is
lon_chuksize = 277
# 240 time steps per chunk (10 days' worth of data)
time_chunksize = 240

##########


def process_batch(start_index, end_index, ncfile_list, zfile_path):
    """Rewrite one batch of NetCDF files as a single rechunked Zarr store.

    Parameters
    ----------
    start_index, end_index : int
        Half-open range ``[start_index, end_index)`` of ``ncfile_list`` to process.
    ncfile_list : list of str
        NetCDF file names, already ordered along time (they are concatenated in list order).
    zfile_path : str
        Path prefix (including the trailing separator) under which the store is written.

    Raises
    ------
    ValueError
        If the requested range selects no files.

    Notes
    -----
    The store is named ``tas2m_<first YYYYMM>_<last YYYYMM>.zarr``; an existing store with
    that name is overwritten (``mode="w"``).
    """
    batch_files = ncfile_list[start_index:end_index]
    if len(batch_files) == 0:
        # Fail early: indexing with end_index - 1 would otherwise pick an unrelated file
        raise ValueError(f"No files in ncfile_list[{start_index}:{end_index}]")

    # Generate a Zarr store name based on the first and last dates in the batch
    start_date = extract_date(batch_files[0])
    end_date = extract_date(batch_files[-1])
    zarr_store_name = zfile_path + f"tas2m_{start_date}_{end_date}.zarr"
    print(zarr_store_name)

    # Read the files in the current batch into a single xarray dataset; the context
    # manager closes the source files once the store has been written
    with xr.open_mfdataset(
        batch_files, combine="nested", concat_dim="time"
    ) as batch_ds:
        # Rechunk the variable to hold 10 days' worth of data in a chunk
        rechunked_dataset = batch_ds.VAR_2T.chunk(
            {
                "time": time_chunksize,
                "latitude": lat_chunksize,
                "longitude": lon_chuksize,
            }
        )

        # Save the combined data to a Zarr store
        rechunked_dataset.to_zarr(zarr_store_name, mode="w")

## Let us rewrite these files into a zarr store

In [12]:
# Scale the cluster to 20 workers
cluster.scale(20)

In [19]:
# Display the cluster summary to check how many workers have started
cluster

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Workers: 20
Total threads: 20,Total memory: 160.00 GiB

0,1
Comm: tcp://128.117.208.119:38713,Workers: 20
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/8787/status,Total threads: 20
Started: 12 minutes ago,Total memory: 160.00 GiB

0,1
Comm: tcp://128.117.208.109:44529,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/40613/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.109:33503,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-gtjitexb,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-gtjitexb

0,1
Comm: tcp://128.117.208.103:36211,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/43873/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.103:44633,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-djjziq3w,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-djjziq3w

0,1
Comm: tcp://128.117.208.103:40169,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/36873/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.103:46693,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-o91bduu3,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-o91bduu3

0,1
Comm: tcp://128.117.208.112:37451,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/33845/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.112:37053,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-osaax9gk,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-osaax9gk

0,1
Comm: tcp://128.117.208.103:39935,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/43345/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.103:46829,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-8nk1gh_e,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-8nk1gh_e

0,1
Comm: tcp://128.117.208.109:41111,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/36955/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.109:37745,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-e8zo3teq,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-e8zo3teq

0,1
Comm: tcp://128.117.208.103:41461,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/38659/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.103:36857,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-gkkcikxv,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-gkkcikxv

0,1
Comm: tcp://128.117.208.109:43387,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/39121/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.109:42601,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-qv6rt2hn,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-qv6rt2hn

0,1
Comm: tcp://128.117.208.112:41065,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44431/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.112:44691,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-62xwgg_4,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-62xwgg_4

0,1
Comm: tcp://128.117.208.112:35711,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/33077/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.112:41319,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-bgbidtlj,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-bgbidtlj

0,1
Comm: tcp://128.117.208.112:43617,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/35913/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.112:40831,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-qx3godz4,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-qx3godz4

0,1
Comm: tcp://128.117.208.109:39975,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/44843/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.109:34737,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-dbn8nonw,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-dbn8nonw

0,1
Comm: tcp://128.117.208.88:40243,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/38037/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.88:46011,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-8pz1_6m4,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-8pz1_6m4

0,1
Comm: tcp://128.117.208.109:40743,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/36007/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.109:46695,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-2ihkf1wu,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-2ihkf1wu

0,1
Comm: tcp://128.117.208.103:42265,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/41607/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.103:44745,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-g5yuetn1,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-g5yuetn1

0,1
Comm: tcp://128.117.208.88:39897,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/40939/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.88:35979,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-8jua02ij,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-8jua02ij

0,1
Comm: tcp://128.117.208.103:34955,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/39475/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.103:44601,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-d7dqlpep,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-d7dqlpep

0,1
Comm: tcp://128.117.208.112:34079,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/46763/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.112:37163,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-c31ourbi,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-c31ourbi

0,1
Comm: tcp://128.117.208.109:42071,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/46559/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.109:38803,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-9tvzykyt,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-9tvzykyt

0,1
Comm: tcp://128.117.208.112:39925,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/38395/status,Memory: 8.00 GiB
Nanny: tcp://128.117.208.112:36275,
Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-stnx8j9c,Local directory: /gpfs/csfs1/collections/rda/scratch/harshah/dask/spill/dask-scratch-space/worker-stnx8j9c


In [15]:
%%time
# Convert the sorted NetCDF files to Zarr, one store per batch of files
batch_size = 120
total_files = len(sorted_ncfiles)
for start_index in np.arange(0, total_files, batch_size):
    # The last batch may hold fewer than batch_size files
    # (shrink the upper bound, e.g. to 249, for a quick trial run)
    end_index = min(start_index + batch_size, total_files)
    print(end_index)
    process_batch(start_index, end_index, sorted_ncfiles, zarr_path)
    print(f"Processed files {start_index} to {end_index}")

print("All files have been processed.")

1009
/gpfs/csfs1/collections/rda/scratch/harshah/tas_zarr/tas2m_202001_202401.zarr


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Processed files 960 to 1009
All files have been processed.
CPU times: user 6min 42s, sys: 12.5 s, total: 6min 55s
Wall time: 7min 33s


In [14]:
%%time
# Open every per-batch Zarr store and concatenate them along time.
# NOTE(review): combine="nested" joins the stores in the order they are listed; presumably
# the glob is sorted by name, which is chronological for tas2m_<YYYYMM>_<YYYYMM>.zarr — confirm.
tas = xr.open_mfdataset(
    zarr_path + "tas2m*.zarr", combine="nested", concat_dim="time", engine="zarr"
).VAR_2T
tas

CPU times: user 292 ms, sys: 87.6 ms, total: 379 ms
Wall time: 963 ms


Unnamed: 0,Array,Chunk
Bytes,2.78 TiB,35.25 MiB
Shape,"(737088, 721, 1440)","(240, 139, 277)"
Dask graph,110808 chunks in 19 graph layers,110808 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.78 TiB 35.25 MiB Shape (737088, 721, 1440) (240, 139, 277) Dask graph 110808 chunks in 19 graph layers Data type float32 numpy.ndarray",1440  721  737088,

Unnamed: 0,Array,Chunk
Bytes,2.78 TiB,35.25 MiB
Shape,"(737088, 721, 1440)","(240, 139, 277)"
Dask graph,110808 chunks in 19 graph layers,110808 chunks in 19 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [15]:
%%time
# Group the time axis into 1-day bins and average each bin.
# NOTE(review): the recorded output shows about 1.1 million chunks in 92156 graph layers.
tas_daily = tas.resample(time="1D").mean()
tas_daily

CPU times: user 1min 9s, sys: 1.74 s, total: 1min 11s
Wall time: 1min 17s


Unnamed: 0,Array,Chunk
Bytes,118.79 GiB,150.40 kiB
Shape,"(30712, 721, 1440)","(1, 139, 277)"
Dask graph,1105632 chunks in 92156 graph layers,1105632 chunks in 92156 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 118.79 GiB 150.40 kiB Shape (30712, 721, 1440) (1, 139, 277) Dask graph 1105632 chunks in 92156 graph layers Data type float32 numpy.ndarray",1440  721  30712,

Unnamed: 0,Array,Chunk
Bytes,118.79 GiB,150.40 kiB
Shape,"(30712, 721, 1440)","(1, 139, 277)"
Dask graph,1105632 chunks in 92156 graph layers,1105632 chunks in 92156 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [20]:
%%time
# Rechunk the daily means into (time=1000, latitude=139, longitude=544) blocks.
# NOTE(review): this rebinds tas_daily, and the cell's execution count (20) is out of
# sequence — confirm it is run after the resample cell on a fresh kernel.
tas_daily = tas_daily.chunk({"latitude": 139, "longitude": 544, "time": 1000})
tas_daily

CPU times: user 1.59 ms, sys: 0 ns, total: 1.59 ms
Wall time: 1.6 ms


Unnamed: 0,Array,Chunk
Bytes,118.79 GiB,288.45 MiB
Shape,"(30712, 721, 1440)","(1000, 139, 544)"
Dask graph,558 chunks in 92157 graph layers,558 chunks in 92157 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 118.79 GiB 288.45 MiB Shape (30712, 721, 1440) (1000, 139, 544) Dask graph 558 chunks in 92157 graph layers Data type float32 numpy.ndarray",1440  721  30712,

Unnamed: 0,Array,Chunk
Bytes,118.79 GiB,288.45 MiB
Shape,"(30712, 721, 1440)","(1000, 139, 544)"
Dask graph,558 chunks in 92157 graph layers,558 chunks in 92157 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Let us now compare the writing speed for NetCDF vs zarr formats

In [17]:
%%time
# Write the daily means to a Zarr store while recording a Dask performance report
daily_zarr_store = zarr_path + "e5_tas2m_daily_1940_2023.zarr"
with performance_report(filename="e5_zarr_report.html"):
    daily_dataset = tas_daily.to_dataset()
    daily_dataset.to_zarr(daily_zarr_store, mode="w")

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


CPU times: user 32min 11s, sys: 1min 44s, total: 33min 56s
Wall time: 40min 57s


In [None]:
%%time
# Write the daily means to a NetCDF file while recording a Dask performance report
daily_nc_file = zarr_path + "e5_tas2m_daily_1940_2023.nc"
with performance_report(filename="e5_nc_report.html"):
    daily_dataset = tas_daily.to_dataset()
    daily_dataset.to_netcdf(daily_nc_file, mode="w")

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [None]:
################################################

### Compare GMST calculation