# Generate annual-mean zarr stores from hourly ERA5 data files

In [15]:
import glob
import re
# pyplot is the module conventionally aliased as `plt`; the bare `matplotlib`
# package does not expose the plotting functions (plt.subplots, plt.show, ...).
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import xarray as xr
import intake
import intake_esm
import pandas as pd

In [16]:
import dask
from dask.distributed import Client, performance_report
from dask_jobqueue import PBSCluster

In [17]:
# --- Storage locations on the RDA GPFS file system ---
rda_scratch = "/gpfs/csfs1/collections/rda/scratch/harshah"
rda_data = "/gpfs/csfs1/collections/rda/data/"
# Scratch folder holding the intermediate 2 m temperature zarr stores
zarr_path = rda_scratch + "/tas_zarr/"
# Folder under rda_data that holds the annual-mean zarr stores
annual_means = rda_data + 'pythia_era5_24/annual_means/'

# --- Remote intake catalog describing the ERA5 files ---
rda_url = 'https://data.rda.ucar.edu/'
era5_catalog = rda_url + 'pythia_era5_24/pythia_intake_catalogs/era5_catalog.json'
# alternate_catalog = rda_data + 'pythia_era5_24/pythia_intake_catalogs/era5_catalog_opendap.json'

# Echo the catalog URL so the cell output records which catalog was used
print(era5_catalog)

https://data.rda.ucar.edu/pythia_era5_24/pythia_intake_catalogs/era5_catalog.json


## Spin up a PBS cluster

In [18]:
# Describe the PBS job that backs each Dask worker; no jobs are submitted
# until the cluster is scaled.
cluster = PBSCluster(
    # Scheduling: job name, queue and per-job wall-clock limit
    job_name='dask-wk24-hpc',
    queue='casper',
    walltime='3:30:00',
    # Resources: one single-threaded worker process with 8 GiB per job
    cores=1,
    processes=1,
    memory='8GiB',
    resource_spec='select=1:ncpus=1:mem=8GB',
    # Scratch locations for worker spill files and PBS/Dask logs
    local_directory=rda_scratch + '/dask/spill',
    log_directory=rda_scratch + '/dask/',
    # Network interface used for scheduler/worker communication
    # (alternative kept for reference: interface='ib0')
    interface='ext',
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40241 instead


In [19]:
# Connect this notebook session to the cluster's scheduler; the bare
# `client` expression displays the connection summary and dashboard link.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/40241/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/40241/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.94:39115,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/harshah/proxy/40241/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [20]:
# Ask the cluster for 30 workers.
cluster.scale(30)

## Find data using the intake catalog

In [21]:
# Open the ERA5 catalog from the JSON description at `era5_catalog`;
# displaying the object summarises the number of unique values per column.
era5_cat = intake.open_esm_datastore(era5_catalog)
era5_cat

  df = pd.read_csv(


Unnamed: 0,unique
era_id,1
datatype,2
level_type,1
step_type,7
table_code,4
param_code,164
variable,212
long_name,212
units,33
year,85


In [22]:
# Preview the distinct (long_name, variable) pairs available in the catalog
(
    era5_cat.df[['long_name', 'variable']]
    .drop_duplicates()
    .head()
)

Unnamed: 0,long_name,variable
0,Potential vorticity,PV
31,Specific rain water content,CRWC
62,Specific snow water content,CSWC
93,Geopotential,Z
124,Temperature,T


### Select variable of interest

In [27]:
# Hourly 2 m temperature entries (VAR_2T)
temp_cat = era5_cat.search(variable='VAR_2T', frequency='hourly')

# MTNLWRF = Mean Top Net Long Wave Radiative Flux, i.e. the outgoing
# longwave radiation (OLR) up to a sign
# rh_cat = era5_cat.search(variable='R')
olr_cat = era5_cat.search(variable='MTNLWRF')
olr_cat

Unnamed: 0,unique
era_id,1
datatype,1
level_type,1
step_type,1
table_code,1
param_code,1
variable,1
long_name,1
units,1
year,85


In [28]:
# Keyword arguments passed to xarray when the catalog entries are opened.
xarray_open_kwargs = dict(
    # Backend used to read the files
    engine='h5netcdf',
    # Empty dict: open lazily as dask arrays with the backend's preferred
    # chunking; put explicit per-dimension sizes here to override it
    chunks={},
    # Extra backend-specific options (none needed at present)
    backend_kwargs={},
)

In [29]:
%%time
# Open every catalog entry matched by the OLR search as xarray datasets.
# The returned dict is keyed by 'datatype.step_type' (see the cell output).
# This is slow: the recorded run took about 6 minutes of wall time.
#dsets = temp_cat.to_dataset_dict(xarray_open_kwargs=xarray_open_kwargs)
#dset_rh = rh_cat.to_dataset_dict(xarray_open_kwargs=xarray_open_kwargs)
dset_olr = olr_cat.to_dataset_dict(xarray_open_kwargs=xarray_open_kwargs)


--> The keys in the returned dictionary of datasets are constructed as follows:
	'datatype.step_type'


CPU times: user 1min 1s, sys: 5.2 s, total: 1min 7s
Wall time: 6min 22s


In [33]:
# List the dataset keys so the right entry can be selected below
dset_olr.keys()

dict_keys(['fc.meanflux'])

In [35]:
# Equivalent selection for the 2 m temperature workflow (currently disabled):
# temp_2m = dsets['an.sfc'].VAR_2T
# temp_2m

# Pull the MTNLWRF variable out of the single 'fc.meanflux' dataset
olr = dset_olr['fc.meanflux']['MTNLWRF']
olr

Unnamed: 0,Array,Chunk
Bytes,2.78 TiB,47.53 MiB
Shape,"(61424, 12, 721, 1440)","(1, 12, 721, 1440)"
Dask graph,61424 chunks in 4037 graph layers,61424 chunks in 4037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.78 TiB 47.53 MiB Shape (61424, 12, 721, 1440) (1, 12, 721, 1440) Dask graph 61424 chunks in 4037 graph layers Data type float32 numpy.ndarray",61424  1  1440  721  12,

Unnamed: 0,Array,Chunk
Bytes,2.78 TiB,47.53 MiB
Shape,"(61424, 12, 721, 1440)","(1, 12, 721, 1440)"
Dask graph,61424 chunks in 4037 graph layers,61424 chunks in 4037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,239.94 kiB,128 B
Shape,"(61424,)","(32,)"
Dask graph,2018 chunks in 4037 graph layers,2018 chunks in 4037 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 239.94 kiB 128 B Shape (61424,) (32,) Dask graph 2018 chunks in 4037 graph layers Data type int32 numpy.ndarray",61424  1,

Unnamed: 0,Array,Chunk
Bytes,239.94 kiB,128 B
Shape,"(61424,)","(32,)"
Dask graph,2018 chunks in 4037 graph layers,2018 chunks in 4037 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [None]:
# temp_2m_annual = temp_2m.resample(time='1Y').mean()
# temp_2m_annual

In [None]:
# Load VAR_2T from the monthly zarr store on scratch
# (presumably monthly means for 1940-2023, going by the store name)
temp_2m_monthly = xr.open_zarr(zarr_path + "e5_tas2m_monthly_1940_2023.zarr")['VAR_2T']
temp_2m_monthly

In [None]:
# Annual mean from monthly means, weighted by the number of days in each month.
# A plain `.resample(...).mean()` would give February the same weight as July
# and therefore would not match the annual mean of the underlying hourly data.
# Assumes the monthly field has no missing values (NaNs would be summed as 0).
# NOTE(review): the '1Y' alias is kept for compatibility with the environment
# this notebook was written for; recent pandas versions spell it 'YE'.
month_days = temp_2m_monthly.time.dt.days_in_month.astype(temp_2m_monthly.dtype)
weighted_total = (temp_2m_monthly * month_days).resample(time='1Y').sum()
days_per_year = month_days.resample(time='1Y').sum()
# Arithmetic between differently named arrays drops the name; restore it so
# `.to_dataset()` still produces the original variable name.
temp_2m_annual = (weighted_total / days_per_year).rename(temp_2m_monthly.name)
# One chunk per time step covering the full horizontal grid (-1 = whole
# dimension), independent of the grid resolution.
temp_2m_annual = temp_2m_annual.chunk({'latitude': -1, 'longitude': -1})
temp_2m_annual

#### Save annual mean to pythia_era5_24/annual_means folder within rda_data

In [None]:
# %%time
# temp_2m_annual.to_dataset().to_zarr(annual_means + 'temp_2m_annual_1940_2023.zarr',mode='w')

In [None]:
# Re-load VAR_2T from the annual-mean zarr store; this replaces any in-memory
# `temp_2m_annual` and requires the store to already exist on disk.
temp_2m_annual = xr.open_zarr(annual_means + 'temp_2m_annual_1940_2023.zarr')['VAR_2T']
temp_2m_annual

In [None]:
%%time
# Quick visual check: plot the first time step of the annual-mean field
temp_2m_annual.isel(time=0).plot()

### Close up the cluster

In [None]:
# Disconnect the client and shut the cluster down so the PBS worker jobs are
# released now instead of idling until their walltime expires.
client.close()
cluster.close()