# Setting up a DASK cluster using dask-jobqueue

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
import xarray as xr

In [None]:
# Import the Client: it is our interface for connecting to the Dask cluster and seeing what happens on it
from dask.distributed import Client

In [None]:
# SLURMCluster will allow us to submit a job to the SLURM batch scheduler
# that will run the DASK cluster
from dask_jobqueue import SLURMCluster

In [None]:
# Describe the SLURM job that will host the Dask workers: the target queue,
# the cores and memory requested per job, and the project string for the job.
# NOTE(review): recent dask-jobqueue releases rename `project` to `account` --
# confirm against the installed version before changing it.
cluster = SLURMCluster(
    queue='analysis',
    cores=8,
    project='gfdl_o',
    memory="96GB",
)

# Request a single worker (the argument counts workers, not cores);
# this is the call that submits the job to SLURM.
cluster.scale(1)

# Attach the client side to the cluster.
client = Client(cluster)

In [None]:
# Display the client summary; click the dashboard link to watch the Dask cluster at work
client

In [None]:
# Open every NetCDF file in the directory as one xarray dataset:
# open_mfdataset takes a glob pattern and combines the matching files.
# chunks={'time': 1} asks for dask chunks of one time step each.
datadir = '/data_cmip6/CMIP6/OMIP/NOAA-GFDL/GFDL-CM4/omip1/r1i1p1f1/Omon/thetao/gr/v20180701/'
ds = xr.open_mfdataset(
    f'{datadir}*.nc',
    chunks={'time': 1},
)

In [None]:
# Compute the time average. This is done lazily, so it is very quick:
# at this point no computation has been performed yet.
temp_mean = ds['thetao'].mean(dim=['time'])

In [None]:
# Inspect the metadata of the time mean, subset to the level lev=2.5
temp_mean.sel(lev=2.5)

In [None]:
# Asking for a plot or for numerical values triggers the actual computation.
# Time to check out the Dask dashboard; %time reports how long the statement took.
%time temp_mean.sel(lev=2.5).plot()

In [None]:
# Once finished, release the nodes: disconnect the client first so it does not
# keep trying to reach a scheduler that is gone, then shut down the cluster.
client.close()
cluster.close()