In [1]:
import os
import dask
import xarray as xr
from tqdm import tqdm_notebook as tqdm

import xarray as xr
import numpy as np
from dask_jobqueue import PBSCluster
# Login name of the current user, read from the environment (raises KeyError if unset).
USER = os.environ['USER']
# PBS project/account name read from PBS_ACCOUNT (raises KeyError if unset);
# it is passed to PBSCluster as `project`.
PROJECT = os.environ['PBS_ACCOUNT']

In [2]:
# Describe the PBS job each dask worker runs in: economy queue, 36 cores in a
# single worker process, 80GB of memory and a 3.5 hour walltime, charged to
# PROJECT. Workers use the user's scratch space as their local directory.
cluster = PBSCluster(
    queue='economy',
    cores=36,
    processes=1,
    memory='80GB',
    project=PROJECT,
    walltime='03:30:00',
    local_directory=f'/glade/scratch/{USER}/dask-tmp',
)

In [3]:
# Scale the cluster adaptively, keeping between 1 and 2 workers.
cluster.adapt(minimum=1, maximum=2)

<distributed.deploy.adaptive.Adaptive at 0x2aaade707dd8>

In [4]:
# Show the current user's jobs in the PBS queue (to check that worker jobs were submitted).
!qstat -u $USER


chadmin1: 
                                                            Req'd  Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time  S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
3200751.chadmin abanihi  economy  STDIN       31315   1   1    --  04:00 R 00:01
3200977.chadmin abanihi  economy  dask-worke    --    1   1    --  03:30 Q   -- 


In [5]:
from dask.distributed import Client

In [6]:
# Connect a dask.distributed client to the scheduler of the PBS cluster.
client = Client(cluster)

In [7]:
# Render the dashboard link as a localhost URL; `{port}` is a template
# placeholder filled in by dask, not a Python format field.
# NOTE(review): presumably meant for viewing the dashboard through a forwarded port — confirm.
dask.config.set({'distributed.dashboard.link':'http://localhost:{port}/status'})

<dask.config.set at 0x2aaadeb242e8>

In [8]:
# Display the client summary (scheduler address, dashboard link, worker/core/memory totals).
client

0,1
Client  Scheduler: tcp://10.148.12.67:49086  Dashboard: http://localhost:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [9]:
from pathlib import Path
from glob import glob

In [10]:
# Directory containing the monthly `TS` time-series NetCDF files read by this notebook.
root_dir = Path("/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS")

In [11]:
# List the files available under root_dir (relies on IPython automagic for `ls`).
ls {root_dir}

[0m[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.040001-049912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.050001-059912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.060001-069912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.070001-079912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.080001-089912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.090001-099912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.100001-109912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.110001-119912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.120001-129912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.130001-139912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.140001-149912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.150001-159912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.160001-169912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.170001-179912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam.h0.TS.180001-189912.nc[0m
[00mb.e11.B1850C5CN.f09_g16.005.cam

## Case: `b.e11.B20TRC5CNBDRD`

In [12]:
# Case name: recorded in the dataset attributes and used to name the Zarr output.
CASE = 'b.e11.B20TRC5CNBDRD.f09_g16'

In [25]:
# Collect the file of every ensemble member for this case, in sorted order.
# The pattern is built from CASE so the case name is defined in one place;
# "???" matches the three-character member id (e.g. 001, 002, ...).
list_1 = sorted(root_dir.glob(f"{CASE}.???.cam.h0.*"))
list_1

[PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.001.cam.h0.TS.185001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.002.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.003.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.004.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.005.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.006.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.007.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/ce

In [26]:
# Number of files matched for this case.
len(list_1)

42

In [27]:
# Positions in list_1 of special runs to remove from the original list.
# The output of these runs has additional output and/or special time ranges.
indices = 0, 33, 34

In [28]:
# Keep every file whose position in list_1 is not one of the excluded indices.
updated_list = [
    path
    for position, path in enumerate(list_1)
    if position not in indices
]
updated_list

[PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.002.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.003.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.004.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.005.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.006.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.007.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/cesmLE/CESM-CAM5-BGC-LE/atm/proc/tseries/monthly/TS/b.e11.B20TRC5CNBDRD.f09_g16.008.cam.h0.TS.192001-200512.nc'),
 PosixPath('/glade/p_old/ce

**Step 2**

Loop through the resulting list from step 1, and read those files into a list of datasets. Under the hood, xarray concatenates files for each ensemble in one dataset.

In [29]:
# Open all remaining files lazily as one dataset, stacking them along a new
# `ensemble` dimension (one entry per file).
# NOTE(review): newer xarray releases expect combine='nested' when concat_dim
# is passed — confirm against the installed xarray version before upgrading.
dset = xr.open_mfdataset(updated_list, concat_dim='ensemble')
dset

<xarray.Dataset>
Dimensions:       (ensemble: 39, ilev: 31, lat: 289, lev: 30, lon: 288, nbnd: 2, slat: 256, slon: 288, time: 1032)
Coordinates:
  * lat           (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * slat          (slat) float64 -89.53 -88.59 -87.64 ... 87.64 88.59 89.53
  * ilev          (ilev) float64 2.255 5.032 10.16 18.56 ... 967.5 985.1 1e+03
  * lev           (lev) float64 3.643 7.595 14.36 24.61 ... 957.5 976.3 992.6
  * lon           (lon) float64 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
  * slon          (slon) float64 -0.625 0.625 1.875 3.125 ... 355.6 356.9 358.1
  * time          (time) datetime64[ns] 1920-02-01 1920-03-01 ... 2006-01-01
Dimensions without coordinates: ensemble, nbnd
Data variables:
    P0            (ensemble) float64 1e+05 1e+05 1e+05 ... 1e+05 1e+05 1e+05
    TS            (ensemble, time, lat, lon) float32 dask.array<shape=(39, 1032, 289, 288), chunksize=(1, 1032, 289, 288)>
    ch4vmr        (ensemble, time) float64 d

**Step 3**

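The datasets from step 2 are already combined into one xarray dataset, concatenated along the `ensemble` dimension. Here we rechunk that dataset (one ensemble member and 20 time steps per chunk) and record the case name in its attributes.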

In [30]:
# Rechunk so that each chunk holds one ensemble member and 20 time steps,
# then tag the dataset with the case it belongs to.
dset = dset.chunk(
    {
        'ensemble': 1,
        'time': 20,
    }
)
dset.attrs.update({'case': CASE})
dset

<xarray.Dataset>
Dimensions:       (ensemble: 39, ilev: 31, lat: 289, lev: 30, lon: 288, nbnd: 2, slat: 256, slon: 288, time: 1032)
Coordinates:
  * lat           (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * slat          (slat) float64 -89.53 -88.59 -87.64 ... 87.64 88.59 89.53
  * ilev          (ilev) float64 2.255 5.032 10.16 18.56 ... 967.5 985.1 1e+03
  * lev           (lev) float64 3.643 7.595 14.36 24.61 ... 957.5 976.3 992.6
  * lon           (lon) float64 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
  * slon          (slon) float64 -0.625 0.625 1.875 3.125 ... 355.6 356.9 358.1
  * time          (time) datetime64[ns] 1920-02-01 1920-03-01 ... 2006-01-01
Dimensions without coordinates: ensemble, nbnd
Data variables:
    P0            (ensemble) float64 dask.array<shape=(39,), chunksize=(1,)>
    TS            (ensemble, time, lat, lon) float32 dask.array<shape=(39, 1032, 289, 288), chunksize=(1, 20, 289, 288)>
    ch4vmr        (ensemble, time) float64 das

In [31]:
# Zarr store to write, under the current user's scratch space. Built from USER
# rather than a hard-coded login so the notebook also runs for other users.
output = f"/glade/scratch/{USER}/data/AWS/lens/{CASE}.zarr"

In [32]:
# Write the dataset to the Zarr store at `output` (mode='w' replaces an existing
# store) and report how long the write takes.
%time dset.to_zarr(output, mode='w')

  dataset.dump_to_store(store, sync=True, encoding=encoding, compute=compute)


CPU times: user 59.9 s, sys: 1.1 s, total: 1min
Wall time: 2min 46s


<xarray.backends.zarr.ZarrStore at 0x2aaaf9e2efd0>

In [33]:
# Re-open the store that was just written, to verify it reads back correctly.
dset_zarr = xr.open_zarr(output)

In [34]:
# Display the re-opened dataset; dimensions and chunking should match what was written.
dset_zarr

<xarray.Dataset>
Dimensions:       (ensemble: 39, ilev: 31, lat: 289, lev: 30, lon: 288, nbnd: 2, slat: 256, slon: 288, time: 1032)
Coordinates:
  * ilev          (ilev) float64 2.255 5.032 10.16 18.56 ... 967.5 985.1 1e+03
  * lat           (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * lev           (lev) float64 3.643 7.595 14.36 24.61 ... 957.5 976.3 992.6
  * lon           (lon) float64 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8
  * slat          (slat) float64 -89.53 -88.59 -87.64 ... 87.64 88.59 89.53
  * slon          (slon) float64 -0.625 0.625 1.875 3.125 ... 355.6 356.9 358.1
  * time          (time) datetime64[ns] 1920-02-01 1920-03-01 ... 2006-01-01
Dimensions without coordinates: ensemble, nbnd
Data variables:
    P0            (ensemble) float64 dask.array<shape=(39,), chunksize=(1,)>
    TS            (ensemble, time, lat, lon) float32 dask.array<shape=(39, 1032, 289, 288), chunksize=(1, 20, 289, 288)>
    ch4vmr        (ensemble, time) float64 das

In [100]:
# Report the dataset size from `nbytes`, converted to gigabytes (1 GB = 1e9 bytes).
size_in_gb = dset_zarr.nbytes / 1e9
print('dataset size in GB {:0.2f}\n'.format(size_in_gb))

dataset size in GB 8.54



In [35]:
!du -s /glade/scratch/abanihi/data/AWS/lens/b.e11.B20TRC5CNBDRD.f09_g16.zarr/ -h

5.1G	/glade/scratch/abanihi/data/AWS/lens/b.e11.B20TRC5CNBDRD.f09_g16.zarr/
