In [1]:
import os
import dask
import xarray as xr
from tqdm import tqdm_notebook as tqdm

import xarray as xr
import numpy as np
from dask_jobqueue import PBSCluster
# Login name and PBS allocation code, both read from the job environment.
# A missing variable raises KeyError here rather than later in the notebook.
USER, PROJECT = os.environ['USER'], os.environ['PBS_ACCOUNT']

  return f(*args, **kwds)


In [2]:
# Describe the PBS job that backs each dask worker: submitted to the 'economy'
# queue for three hours and charged to PROJECT, with one 36-core process and
# 80 GB of memory. local_directory points into the user's scratch space.
cluster = PBSCluster(
    # scheduler / accounting
    queue='economy',
    project=PROJECT,
    walltime='03:00:00',
    # resources per job
    cores=36,
    processes=1,
    memory='80GB',
    # on-disk working area
    local_directory=f'/glade/scratch/{USER}/dask-tmp',
)

In [3]:
# Turn on adaptive scaling with a floor of 1 and a ceiling of 2; the returned
# Adaptive object is the cell's displayed value.
cluster.adapt(minimum=1, maximum=2)

<distributed.deploy.adaptive.Adaptive at 0x2aaade9d1be0>

In [4]:
# List the current user's jobs in the PBS queue to see whether the dask worker
# job has been submitted and what state it is in.
!qstat -u $USER


chadmin1: 
                                                            Req'd  Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time  S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
3157478.chadmin abanihi  economy  STDIN       46216   1   1    --  04:00 R 00:16
3157628.chadmin abanihi  economy  dask-worke    --    1   1    --  03:00 Q   -- 


In [5]:
from dask.distributed import Client

In [6]:
# Attach a distributed client to the PBS-backed cluster created above.
client = Client(cluster)

In [7]:
# URL template for the dashboard link shown in the client repr; `{port}` is a
# placeholder, not an f-string field.
# NOTE(review): presumably relies on the dashboard port being forwarded to
# localhost — confirm for your setup.
dask.config.set({'distributed.dashboard.link':'http://localhost:{port}/status'})

<dask.config.set at 0x2aaadf2a5940>

In [8]:
client

0,1
Client  Scheduler: tcp://10.148.5.112:42012  Dashboard: http://localhost:8787/status,Cluster  Workers: 1  Cores: 36  Memory: 80.00 GB


In [10]:
from pathlib import Path
from glob import glob

In [26]:
# Directory holding the monthly `TS` time-series netCDF files that the rest of
# the notebook globs over.
root_dir = Path("/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS")

In [27]:
ls {root_dir}

[0m[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.040001-049912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.050001-059912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.060001-069912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.070001-079912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.080001-089912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.090001-099912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.100001-109912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.110001-119912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.120001-129912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.130001-139912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.140001-149912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.150001-159912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.160001-169912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.170001-179912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.180001-189912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam

## Case 1: `b.e11.BRCP85C5CNBDRD`

For this case, there are two time ranges for each ensemble:
- `2006-01 -> 2080-12`
- `2081-01 -> 2100-12`

In [90]:
# Case identifier: the file-name prefix shared by every ensemble member of this
# case. It is stored in the dataset attributes and used to name the zarr store.
CASE = 'b.e11.BRCP85C5CNBDRD.f09_g16'

In [91]:
# Collect the files of every ensemble member (`???` is the three-character
# member id) for each of the two time ranges. The patterns are built from CASE
# so the case name is spelled in exactly one place; sorting gives both lists
# the same member order as long as each member has a file in both ranges.
list_1 = sorted(root_dir.glob(f"{CASE}.???.cam.h0.*.200601-208012*"))
list_2 = sorted(root_dir.glob(f"{CASE}.???.cam.h0.*.208101-210012*"))

We are going to read these files in three steps:

- Step 1: Map files in `list_1` and `list_2` for each ensemble in a list of tuples where each tuple contains files for each ensemble for the two time ranges.
- Step 2: Loop through the resulting list from step 1, and read those files into a list of datasets. Under the hood, xarray concatenates files for each ensemble in one dataset.
- Step 3: Concatenate list of datasets from step 2 into one xarray dataset. We concatenate these datasets along the `ensemble` dimension.

**Step 1**

Map files in `list_1` and `list_2` for each ensemble in a list of tuples where each tuple contains files for each ensemble for the two time ranges.

In [92]:
# zip() stops at the shorter list without complaint and pairs purely by
# position, so a file missing from either time range would silently drop or
# mispair ensemble members. Fail loudly instead.
if len(list_1) != len(list_2):
    raise ValueError(
        "time ranges have different file counts: "
        f"{len(list_1)} for 200601-208012 vs {len(list_2)} for 208101-210012"
    )
case_1 = list(zip(list_1, list_2))

# The member id is the 5th dot-separated field of the file name
# (e.g. b.e11.BRCP85C5CNBDRD.f09_g16.001.cam.h0...); both files of a pair must
# carry the same one.
for first, second in case_1:
    if first.name.split('.')[4] != second.name.split('.')[4]:
        raise ValueError(
            f"mismatched ensemble members: {first.name} / {second.name}"
        )
case_1[0]

(PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.BRCP85C5CNBDRD.f09_g16.001.cam.h0.TS.200601-208012.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.BRCP85C5CNBDRD.f09_g16.001.cam.h0.TS.208101-210012.nc'))

In [93]:
len(case_1)

33

**Step 2**

Loop through the resulting list from step 1, and read those files into a list of datasets. Under the hood, xarray concatenates files for each ensemble in one dataset.

In [94]:
# Open each member's pair of files as a single multi-file dataset, keeping the
# results in member order.
ds_list = []
for member_files in case_1:
    ds_list.append(xr.open_mfdataset(member_files))

# Preview the first two members only; the full list is long.
ds_list[:2]

[<xarray.Dataset>
 Dimensions:       (ilev: 31, lat: 192, lev: 30, lon: 288, nbnd: 2, slat: 191, slon: 288, time: 1140)
 Coordinates:
   * ilev          (ilev) float64 2.255 5.032 10.16 18.56 ... 967.5 985.1 1e+03
   * lat           (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
   * lev           (lev) float64 3.643 7.595 14.36 24.61 ... 957.5 976.3 992.6
   * lon           (lon) float64 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
   * slat          (slat) float64 -89.53 -88.59 -87.64 ... 87.64 88.59 89.53
   * slon          (slon) float64 -0.625 0.625 1.875 3.125 ... 355.6 356.9 358.1
   * time          (time) datetime64[ns] 2006-02-01 2006-03-01 ... 2101-01-01
 Dimensions without coordinates: nbnd
 Data variables:
     P0            (time) float64 1e+05 1e+05 1e+05 1e+05 ... 1e+05 1e+05 1e+05
     TS            (time, lat, lon) float32 dask.array<shape=(1140, 192, 288), chunksize=(900, 192, 288)>
     ch4vmr        (time) float64 dask.array<shape=(1140,), chunksize=

**Step 3**

Concatenate list of datasets from step 2 into one xarray dataset. We concatenate these datasets along the `ensemble` dimension.

In [96]:
# Stack the per-member datasets along a new `ensemble` dimension, then rechunk
# to one member and 20 time steps per chunk before writing.
stacked = xr.concat(ds_list, dim='ensemble')
dset = stacked.chunk({'ensemble': 1, 'time': 20})

# Record which case this dataset holds in its global attributes.
dset.attrs['case'] = CASE
dset

<xarray.Dataset>
Dimensions:       (ensemble: 33, ilev: 31, lat: 192, lev: 30, lon: 288, nbnd: 2, slat: 191, slon: 288, time: 1140)
Coordinates:
  * ilev          (ilev) float64 2.255 5.032 10.16 18.56 ... 967.5 985.1 1e+03
  * lat           (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * lev           (lev) float64 3.643 7.595 14.36 24.61 ... 957.5 976.3 992.6
  * lon           (lon) float64 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
  * slat          (slat) float64 -89.53 -88.59 -87.64 ... 87.64 88.59 89.53
  * slon          (slon) float64 -0.625 0.625 1.875 3.125 ... 355.6 356.9 358.1
  * time          (time) datetime64[ns] 2006-02-01 2006-03-01 ... 2101-01-01
Dimensions without coordinates: ensemble, nbnd
Data variables:
    P0            (ensemble, time) float64 dask.array<shape=(33, 1140), chunksize=(1, 20)>
    TS            (ensemble, time, lat, lon) float32 dask.array<shape=(33, 1140, 192, 288), chunksize=(1, 20, 192, 288)>
    ch4vmr        (ensemble, tim

In [97]:
# Destination zarr store, named after the case. It lives under the current
# user's scratch space (USER comes from the environment) rather than a fixed
# account, so the notebook runs for whoever launches it.
output = f"/glade/scratch/{USER}/data/AWS/lens/{CASE}.zarr"

In [98]:
# Write the combined dataset to the zarr store at `output`, timing the call.
# NOTE(review): mode='w' is expected to replace an existing store at that path
# — confirm against the xarray version in use before re-running.
%time dset.to_zarr(output, mode='w')

CPU times: user 2min 30s, sys: 3.42 s, total: 2min 34s
Wall time: 3min 55s


<xarray.backends.zarr.ZarrStore at 0x2aab04a75e80>

In [99]:
# Re-open the store that was just written so its contents can be inspected.
dset_zarr = xr.open_zarr(output)

In [110]:
dset_zarr

<xarray.Dataset>
Dimensions:       (ensemble: 33, ilev: 31, lat: 192, lev: 30, lon: 288, nbnd: 2, slat: 191, slon: 288, time: 1140)
Coordinates:
  * ilev          (ilev) float64 2.255 5.032 10.16 18.56 ... 967.5 985.1 1e+03
  * lat           (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * lev           (lev) float64 3.643 7.595 14.36 24.61 ... 957.5 976.3 992.6
  * lon           (lon) float64 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
  * slat          (slat) float64 -89.53 -88.59 -87.64 ... 87.64 88.59 89.53
  * slon          (slon) float64 -0.625 0.625 1.875 3.125 ... 355.6 356.9 358.1
  * time          (time) datetime64[ns] 2006-02-01 2006-03-01 ... 2101-01-01
Dimensions without coordinates: ensemble, nbnd
Data variables:
    P0            (ensemble, time) float64 dask.array<shape=(33, 1140), chunksize=(1, 20)>
    TS            (ensemble, time, lat, lon) float32 dask.array<shape=(33, 1140, 192, 288), chunksize=(1, 20, 192, 288)>
    ch4vmr        (ensemble, tim

In [100]:
# Size of the dataset as reported by xarray's `nbytes`, converted to
# decimal gigabytes.
size_gb = dset_zarr.nbytes / 1e9
print(f'dataset size in GB {size_gb:0.2f}\n')

dataset size in GB 8.54



In [101]:
!du -s /glade/scratch/abanihi/data/AWS/lens/b.e11.BRCP85C5CNBDRD.f09_g16.zarr/ -h

4.7G	/glade/scratch/abanihi/data/AWS/lens/b.e11.BRCP85C5CNBDRD.f09_g16.zarr/


In [109]:
!du  -ach /glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.BRCP85C5CNBDRD.f09_g16.???.cam.h0.* -h | tail -1 | cut -f 1

5.7G
