# zarr-ification of CESM-LE for AWS

In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import shutil

import yaml

import xarray as xr

import intake
from distributed.utils import format_bytes
from operator import mul
from functools import reduce

## Spin up a Dask cluster

In [2]:
# Create the Dask cluster object that launches workers as batch jobs.
# memory="70GB" is the per-worker setting; it appears as the worker
# --memory-limit in the job script printed in the next cell.
from ncar_jobqueue import NCARCluster
cluster = NCARCluster(memory="70GB")
# Bare last expression: the cluster widget is rendered as the cell output.
cluster

VBox(children=(HTML(value='<h2>NCARCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [3]:
# Show the batch script used to start each worker so the requested
# resources (account, memory, walltime, constraints) can be checked.
print(cluster.job_script())

#!/bin/bash

#!/usr/bin/env bash
#SBATCH -J dask-worker
#SBATCH -A NIOW0001
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=66G
#SBATCH -t 06:00:00
#SBATCH -C skylake
#SBATCH -C casper
JOB_ID=${SLURM_JOB_ID%;*}



/glade/work/abanihi/softwares/miniconda3/envs/analysis/bin/python -m distributed.cli.dask_worker tcp://10.12.203.5:46034 --nthreads 1 --memory-limit 70.00GB --name dask-worker--${JOB_ID}-- --death-timeout 60 --interface ib0



In [4]:
# Turn on adaptive scaling with a floor of 20 workers.
# NOTE(review): wait_count presumably delays scale-down decisions; confirm
# its exact meaning against the installed distributed version.
cluster.adapt(minimum=20, wait_count=60)

<distributed.deploy.adaptive.Adaptive at 0x2b95551780f0>

In [5]:
# Attach a client to the cluster's scheduler; the conversion loop further
# down relies on this cluster being up.
from dask.distributed import Client
client = Client(cluster) # Connect this local process to remote workers
# Bare last expression: shows the scheduler address, dashboard link and worker totals.
client

0,1
Client  Scheduler: tcp://10.12.203.5:46034  Dashboard: https://jupyterhub.ucar.edu/ch/user/abanihi/proxy/8787/status,Cluster  Workers: 33  Cores: 33  Memory: 2.31 TB


## Connect to intake-esm database

In [6]:
# Open the 'CESM1-LE' intake-esm collection. `col.df` is the catalog table,
# with columns such as experiment, case, component, stream, variable,
# member_id and file_fullpath (see the preview below).
col = intake.open_esm_metadatastore(collection_name='CESM1-LE')
# Preview the first catalog rows.
col.df.head()

Unnamed: 0,resource,resource_type,direct_access,experiment,case,component,stream,variable,date_range,member_id,file_fullpath,file_basename,file_dirname,ctrl_branch_year,year_offset,sequence_order,has_ocean_bgc,grid
0,CTRL:GLADE:posix:/glade/collections/cdg/data/c...,posix,True,CTRL,b.e11.B1850C5CN.f09_g16.005,atm,cam.h1,T200,18000101-18991231,0,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,b.e11.B1850C5CN.f09_g16.005.cam.h1.T200.180001...,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,,1448.0,0,True,
1,CTRL:GLADE:posix:/glade/collections/cdg/data/c...,posix,True,CTRL,b.e11.B1850C5CN.f09_g16.005,atm,cam.h1,T200,08000101-08991231,0,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,b.e11.B1850C5CN.f09_g16.005.cam.h1.T200.080001...,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,,1448.0,0,True,
2,CTRL:GLADE:posix:/glade/collections/cdg/data/c...,posix,True,CTRL,b.e11.B1850C5CN.f09_g16.005,atm,cam.h1,T200,04020101-04991231,0,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,b.e11.B1850C5CN.f09_g16.005.cam.h1.T200.040201...,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,,1448.0,0,True,
3,CTRL:GLADE:posix:/glade/collections/cdg/data/c...,posix,True,CTRL,b.e11.B1850C5CN.f09_g16.005,atm,cam.h1,T200,10000101-10991231,0,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,b.e11.B1850C5CN.f09_g16.005.cam.h1.T200.100001...,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,,1448.0,0,True,
4,CTRL:GLADE:posix:/glade/collections/cdg/data/c...,posix,True,CTRL,b.e11.B1850C5CN.f09_g16.005,atm,cam.h1,T200,11000101-11991231,0,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,b.e11.B1850C5CN.f09_g16.005.cam.h1.T200.110001...,/glade/collections/cdg/data/cesmLE/CESM-CAM5-B...,,1448.0,0,True,


## Helper functions

In [7]:
def collection_summary(col):
    """Print how many distinct values the catalog holds for its key columns.

    Parameters
    ----------
    col : object
        Collection exposing the catalog table as ``col.df`` with the columns
        ``experiment``, ``case``, ``component``, ``stream`` and ``variable``.

    For each column the number of unique values is printed, followed by the
    values themselves. ``case`` is truncated to the first 5 values and
    ``variable`` to the first 10; truncated listings end with "etc...".
    """
    catalog = col.df
    # (column name, plural label, number of values listed or None for all)
    layout = (
        ("experiment", "experiments", None),
        ("case", "cases", 5),
        ("component", "components", None),
        ("stream", "streams", None),
        ("variable", "variables", 10),
    )

    print("There are:\n")
    for position, (column, label, limit) in enumerate(layout, start=1):
        series = catalog[column]
        count = series.nunique()
        values = series.unique()
        if limit is None:
            listing = f"{values}"
        else:
            listing = f"{values[:limit]} etc..."
        print(f"\t{position}) {count} {label}: \n\n\t\t{listing}\n")
    

In [8]:
def print_query_info(query_results, query=None):
    """Print a short summary of the catalog rows matched by a search.

    Parameters
    ----------
    query_results : pandas.DataFrame
        Rows returned by the search. Must provide the ``variable``,
        ``file_fullpath``, ``case`` and ``member_id`` columns.
    query : dict, optional
        The query that produced ``query_results``; it is echoed in the first
        line of the summary. When omitted, the function falls back to a
        notebook-level variable named ``query`` (the behaviour existing
        callers rely on) and prints ``None`` if no such variable exists.
    """
    if query is None:
        # Backward compatibility: older call sites pass only the results and
        # expect the module-level `query` to be reported.
        query = globals().get("query")

    variables = sorted(query_results.variable.unique().tolist())
    files = sorted(query_results.file_fullpath.unique().tolist())
    cases = sorted(query_results.case.unique().tolist())
    # .tolist() yields plain Python values so the printed member list does not
    # depend on how NumPy formats its scalar types.
    members = sorted(query_results.member_id.unique().tolist())

    print(f"This query: {query} returned:\n\n- {len(variables)} unique variable(s)\n\n")
    print(f"- Number of files: {len(files)}\n\n")
    print(f"- {len(members)} member(s): {members}\n\n")
    print(f"- {len(cases)} case(s)\n\n")
    

In [9]:
# Output directory and naming scheme for the generated zarr stores.
dirout = '/glade/scratch/abanihi/lens-aws'
def zarr_file(exp, cmp, frequency, var, write=False):
    """Build the zarr store path for one experiment/component/frequency/variable.

    The path is ``{dirout}/cesmLE-{exp}-{cmp}-{frequency}-{var}.zarr``.
    When ``write`` is true and something already exists at that path, it is
    removed with ``shutil.rmtree`` so a fresh store can be written there.
    The path is printed and then returned.
    """
    store_name = f'cesmLE-{exp}-{cmp}-{frequency}-{var}.zarr'
    path = '/'.join((dirout, store_name))
    # Only touch the filesystem when the caller intends to (re)write the store.
    must_clear = write and os.path.exists(path)
    if must_clear:
        shutil.rmtree(path)
    print(path)
    return path

In [10]:
def read_data(cat, xr_open, member_chunks):
    """Load the catalog selection and rechunk it.

    ``cat.to_xarray(**xr_open)`` returns a mapping of datasets; one entry is
    taken from it with ``popitem()`` (the key is discarded) and that dataset
    is returned after ``.chunk(member_chunks)``.
    """
    datasets = cat.to_xarray(**xr_open)
    # popitem() yields a (key, dataset) pair; only the dataset is needed.
    dataset = datasets.popitem()[1]
    rechunked = dataset.chunk(member_chunks)
    return rechunked

def print_ds_info(ds, var):
    """Print chunking and size details for one variable of a dataset.

    Parameters
    ----------
    ds : dataset-like
        Object indexable by variable name and exposing ``nbytes``.
    var : str
        Name of the variable to report on. Its ``.data`` must expose
        ``chunksize`` (assumes a dask-backed array -- TODO confirm).

    Notes
    -----
    "Chunk size" is the product of the chunk shape times the dtype item size.
    "Dataset size" is ``ds.nbytes``, i.e. the whole dataset rather than only
    ``var``.
    """
    data_array = ds[var]
    bytes_per_item = data_array.dtype.itemsize
    chunk_shape = data_array.data.chunksize
    dataset_size = format_bytes(ds.nbytes)
    # Elements in one chunk multiplied by the bytes each element occupies.
    chunk_bytes = format_bytes(reduce(mul, chunk_shape) * bytes_per_item)

    report = (
        f"Variable name: {var}",
        f"Chunk shape: {chunk_shape}",
        f"Dataset shape: {ds[var].shape}",
        f"Chunk size: {chunk_bytes}",
        f"Dataset size: {dataset_size}",
        f"Chunks: {ds[var].chunks}",
    )
    for line in report:
        print(line)

In [11]:
# Overview of the catalog contents: counts and sample values for experiments,
# cases, components, streams and variables.
collection_summary(col)

There are:

	1) 6 experiments: 

		['CTRL' 'CTRL-AMIP' 'CTRL-SLAB-OCN' '20C' '20C-OIC' 'RCP85']

	2) 93 cases: 

		['b.e11.B1850C5CN.f09_g16.005' 'f.e11.F1850C5CN.f09_f09.001'
 'e.e11.E1850C5CN.f09_g16.001' 'b.e11.B20TRC5CNBDRD.f09_g16.105'
 'b.e11.B20TRC5CNBDRD.f09_g16.104'] etc...

	3) 5 components: 

		['atm' 'ocn' 'lnd' 'rof' 'ice']

	4) 10 streams: 

		['cam.h1' 'cam.h0' 'pop.h.nday1' 'pop.h' 'clm2.h1' 'clm2.h0' 'rtm.h1'
 'rtm.h0' 'cice.h1' 'cice.h']

	5) 985 variables: 

		['T200' 'Q200' 'FLNSC' 'Q500' 'PRECL' 'QBOT' 'TREFHTMN' 'FSNTOA' 'TAUX'
 'TAUY'] etc...



## Load variable/chunking specs

In [12]:
# Load the per-component / per-stream specification of which variables to
# convert and how to chunk them. The path is relative to the notebook's
# working directory.
# NOTE(review): the loaded spec lists 'v' in lowercase under atm/cam.h0 while
# its neighbours are uppercase -- confirm it matches the catalog variable name.
with open("../variables_chunking_spec.yaml") as f:
    specs = yaml.safe_load(f)
# Bare last expression: display the parsed spec for inspection.
specs

{'ocn': {'pop.h': {'variable_category': {'2D': {'variables': ['SST', 'SSH'],
     'chunks': {'time': 12, 'member_id': 40}},
    '3D': {'variables': ['SALT'],
     'chunks': {'time': 12, 'z_t': 1, 'member_id': 40}}},
   'frequency': 'monthly'}},
 'ice': {'cice.h': {'variable_category': {'2D': {'variables': ['aice_nh',
      'aice_sh',
      'hi_nh',
      'hi_sh'],
     'chunks': {'time': 12}}},
   'frequency': 'monthly'}},
 'atm': {'cam.h0': {'variable_category': {'2D': {'variables': ['T',
      'U',
      'v',
      'Q',
      'Z3'],
     'chunks': {'time': 12}},
    '3D': {'variables': ['FLNS', 'FLNSC'], 'chunks': {'time': 12}}},
   'frequency': 'monthly'}},
 'lnd': {'clm2.h0': {'variable_category': {'3D_2D': {'variables': ['FSNO',
      'H2OSNO',
      'QRUNOFF',
      'RAIN',
      'SNOW',
      'SOILWATER_10CM',
      'SOILLIQ'],
     'chunks': {'time': 12, 'levgrnd': 1, 'levlak': 1}}},
   'frequency': 'monthly'}}}

## Process `land` component (realm)

In [13]:
# Components to convert in this pass; each entry is used as a key into `specs`.
component_list = ['lnd']

# Experiments to convert for every component listed above.
experiment_list = ['20C', 'RCP85']


In [None]:
# Convert every (experiment, component, stream, variable) combination named in
# the chunking spec into its own zarr store. Any error propagates immediately
# and stops the run.
for exp in experiment_list:
    print("="*120)
    for cmp in component_list:
        print("#"*120)
        specs_ = specs.copy()
        for stm, stm_specs in specs_[cmp].items():
            frequency = stm_specs['frequency']
            var_cats = stm_specs['variable_category']
            for var_cat, var_cat_values in var_cats.items():
                variables = var_cat_values['variables']
                # Copy before popping so the loaded spec itself is not modified.
                chunks = var_cat_values['chunks'].copy()
                # 'member_id' chunking is passed to read_data separately from
                # the file-open chunks; it defaults to 40 when the spec omits it.
                member_chunks = {'member_id': chunks.pop('member_id', 40)}
                xr_open = dict(chunks=chunks, decode_times=False, decode_coords=False)
                for variable in variables:
                    # NOTE: print_query_info reads this notebook-level `query`.
                    query = dict(experiment=exp, component=cmp, stream=stm, variable=variable)
                    cat = col.search(**query)
                    results = cat.query_results
                    if results.empty:
                        print(f"No results found for query={query}")
                        continue

                    print("*"*100)
                    print_query_info(results)
                    # write=True removes any existing store at this path first.
                    path = zarr_file(exp, cmp, frequency, variable, write=True)
                    ds = read_data(cat, xr_open, member_chunks)
                    print("+"*35)
                    # The hemisphere-specific ice entries are looked up in the
                    # dataset under the un-suffixed name (presumably how the
                    # files store them -- verify). The loop variable itself is
                    # left untouched so the store path keeps the catalog name.
                    if variable in {'aice_nh', 'aice_sh'}:
                        data_var = 'aice'
                    elif variable in {'hi_nh', 'hi_sh'}:
                        data_var = 'hi'
                    else:
                        data_var = variable
                    print_ds_info(ds, data_var)
                    print("+"*35)
                    print(ds)
                    ds.to_zarr(path)


########################################################################################################################
****************************************************************************************************
This query: {'experiment': '20C', 'component': 'lnd', 'stream': 'clm2.h0', 'variable': 'FSNO'} returned:

- 1 unique variable(s)


- Number of files: 40


- 40 member(s): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 101, 102, 103, 104, 105]


- 40 case(s)


/glade/scratch/abanihi/lens-aws/cesmLE-20C-lnd-monthly-FSNO.zarr


HBox(children=(IntProgress(value=0, description='dataset', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='member', max=40, style=ProgressStyle(description_width='initi…


+++++++++++++++++++++++++++++++++++
Variable name: FSNO
Chunk shape: (40, 12, 192, 288)
Dataset shape: (40, 1872, 192, 288)
Chunk size: 106.17 MB
Dataset size: 17.43 GB
Chunks: ((40,), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (192,), (288,))
+++++++++++++++++++++++++++++++++++
<xarray.Dataset>
Dimensions:       (hist_interval: 2, lat: 192, levgrnd: 15, levlak: 10, lon: 288, member_id: 40, time: 1872)
Coordinate