# Script to Extract Data
**Input Data:** Original Data  
**Output Data:** Data for a specific variable and spatial extent  
**Description:** Extracts data for a specific variable and spatial extent and exports them to a new file.  
**Date:** June 2022  
**Author:** Emma Perkins  
**Updated:** April 2023 by Teagan King  
**Note:** It is recommended to run this notebook with 50GB memory allocation.

In [1]:
# import relevant packages
import glob
import os

import xarray as xr

In [2]:
# dask tuning parameters used when building the cluster and opening the dataset
nworkers = 20  # number of workers the cluster is scaled to
nmem = '30'  # per-worker memory as a numeric string; the unit suffix is appended where it is used
# chunk lengths along the time / lat / lon dimensions
chunk_time, chunk_lat, chunk_lon = 73, 100, 100

In [3]:
# dask core, the distributed client and the PBS job-queue backend
import dask
from dask.distributed import Client
from dask_jobqueue import PBSCluster

# Describe the PBS job requested for each dask worker
cluster = PBSCluster(
    # scheduling: where the job runs and who is charged for it
    queue='casper',  # the type of queue to utilize (/glade/u/apps/dav/opt/usr/bin/execcasper)
    account='P93300065',  # input your project ID here (this keyword was previously called 'project')
    walltime='04:00:00',  # amount of wall time
    # resources: one single-core process with nmem of memory
    cores=1,
    processes=1,
    memory=f'{nmem}GiB',
    resource_spec=f'select=1:ncpus=1:mem={nmem}GB',
    # worker environment
    local_directory='/glade/scratch/$USER/local_dask',  # use your local directory
    interface='ib0',  # network interface to use
)

# Request nworkers workers from the cluster
cluster.scale(nworkers)

# Point the dashboard link at the JupyterHub proxy so the dashboard can be opened from the notebook
dask.config.set({'distributed.dashboard.link':'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/proxy/{port}/status'})

# Connect a client to the cluster
client = Client(cluster)

In [4]:
# Show the client summary (connection method, cluster type, dashboard link, worker counts).
# NOTE(review): the saved output reports 0 workers right after scale(); presumably the PBS jobs
# had not started yet when this cell ran -- confirm on the dashboard before loading data.
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/tking/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/tking/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.12.206.35:39790,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/tking/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


### Load Full Data

In [5]:
# directory that holds the full data files
paths = '/glade/campaign/cesm/collections/cesmLE/CESM-CAM5-BGC-LE/lnd/proc/tseries/daily/H2OSNO/'  # change to your paths

# files to read, listed explicitly (their names carry the date ranges 19000101-19991231 and 20000101-20991231);
# glob is imported at the top of the notebook should a sorted(glob.glob(...)) listing be preferred
file_names = [
    'b.e11.B1850C5CN.f09_g16.005.clm2.h1.H2OSNO.19000101-19991231.nc',
    'b.e11.B1850C5CN.f09_g16.005.clm2.h1.H2OSNO.20000101-20991231.nc',
]
files = [paths + file_name for file_name in file_names]
file1, file2 = files

# open both files as a single dask-backed dataset, chunked with the sizes set in the parameter cell
chunk_spec = {"time": chunk_time, "lat": chunk_lat, "lon": chunk_lon}
full_data = xr.open_mfdataset(files, concat_dim=None, parallel=True, chunks=chunk_spec)


In [6]:
# display the dask-backed dataset to inspect its variables, shapes and chunking
full_data

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 285.16 kiB 292 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type int32 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 285.16 kiB 292 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type int32 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 285.16 kiB 292 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type int32 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 285.16 kiB 292 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type int32 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 285.16 kiB 292 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type int32 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,285.16 kiB,292 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.11 MiB,1.14 kiB
Shape,"(73000, 2)","(73, 2)"
Count,2002 Tasks,1000 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 1.11 MiB 1.14 kiB Shape (73000, 2) (73, 2) Count 2002 Tasks 1000 Chunks Type object numpy.ndarray",2  73000,

Unnamed: 0,Array,Chunk
Bytes,1.11 MiB,1.14 kiB
Shape,"(73000, 2)","(73, 2)"
Count,2002 Tasks,1000 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,570.31 kiB,584 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,|S8,numpy.ndarray
"Array Chunk Bytes 570.31 kiB 584 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type |S8 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,570.31 kiB,584 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,|S8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,570.31 kiB,584 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,|S8,numpy.ndarray
"Array Chunk Bytes 570.31 kiB 584 B Shape (73000,) (73,) Count 2002 Tasks 1000 Chunks Type |S8 numpy.ndarray",73000  1,

Unnamed: 0,Array,Chunk
Bytes,570.31 kiB,584 B
Shape,"(73000,)","(73,)"
Count,2002 Tasks,1000 Chunks
Type,|S8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,1.36 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.04 GiB 1.36 GiB Shape (73000, 192, 288) (36500, 100, 100) Count 38 Tasks 12 Chunks Type float32 numpy.ndarray",288  192  73000,

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,1.36 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,1.36 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.04 GiB 1.36 GiB Shape (73000, 192, 288) (36500, 100, 100) Count 38 Tasks 12 Chunks Type float32 numpy.ndarray",288  192  73000,

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,1.36 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,1.36 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.04 GiB 1.36 GiB Shape (73000, 192, 288) (36500, 100, 100) Count 38 Tasks 12 Chunks Type float32 numpy.ndarray",288  192  73000,

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,1.36 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.08 GiB,2.72 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 30.08 GiB 2.72 GiB Shape (73000, 192, 288) (36500, 100, 100) Count 38 Tasks 12 Chunks Type float64 numpy.ndarray",288  192  73000,

Unnamed: 0,Array,Chunk
Bytes,30.08 GiB,2.72 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.08 GiB,2.72 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 30.08 GiB 2.72 GiB Shape (73000, 192, 288) (36500, 100, 100) Count 38 Tasks 12 Chunks Type float64 numpy.ndarray",288  192  73000,

Unnamed: 0,Array,Chunk
Bytes,30.08 GiB,2.72 GiB
Shape,"(73000, 192, 288)","(36500, 100, 100)"
Count,38 Tasks,12 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,2.78 MiB
Shape,"(73000, 192, 288)","(73, 100, 100)"
Count,12002 Tasks,6000 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 15.04 GiB 2.78 MiB Shape (73000, 192, 288) (73, 100, 100) Count 12002 Tasks 6000 Chunks Type float32 numpy.ndarray",288  192  73000,

Unnamed: 0,Array,Chunk
Bytes,15.04 GiB,2.78 MiB
Shape,"(73000, 192, 288)","(73, 100, 100)"
Count,12002 Tasks,6000 Chunks
Type,float32,numpy.ndarray


### Select Variable of Interest

In [7]:
# Pick a single variable out of the full dataset; every later cell works on data_select only.
data_var = 'H2OSNO'  # change to variable of interest from climate model data
# indexing the dataset by name raises a KeyError if data_var is not present in full_data
data_select = full_data[data_var]

### Select Area of Interest

In [8]:
%%time

# determine variable names
lat_var = 'lat'  # name of latitude variable for original data
lon_var = 'lon'  # name of longitude variable for original data

# rename lat lon variables to all be lat lon
data_select = data_select.rename({lat_var: 'lat', lon_var: 'lon'})

# sort by latitude:
data_select = data_select.sortby('lat')

lon_type = 'long3'  # longitude coordinate type: long1 (-180 - 180) or long 3 (0 - 360)
if lon_type == 'long3':
    lon_new = (data_select.lon + 180) % 360 - 180
    data_select['lon'] = lon_new
data_select = data_select.sortby('lon')

# select input area from left to right / west to east:
lat_min = 50  # minimum latitude
lat_max = 90  # maximum latitude
lon_min = 150  # minimum longitude
lon_max = -100  # maximum longitude

if lon_min < lon_max:
    data_select = data_select.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, lon_max))
else:
    data_select1 = data_select.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, 180))
    data_select2 = data_select.sel(lat=slice(lat_min, lat_max), lon=slice(-180, lon_max))
    data_select = xr.concat([data_select1, data_select2], dim='lon')
data_select = data_select.sortby('lon')

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


CPU times: user 350 ms, sys: 3.39 ms, total: 354 ms
Wall time: 360 ms


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


### Standardize Time Step
This step can be skipped if the data already use the desired time step.

In [9]:
%%time

analysis_time_type = '1D'  # time step for analysis (ex: 6H, 1D, 1M, 1Y, etc.)

# choose either .sum() or .mean() for accumulation or instantaneous variables respectively
data_select = data_select.resample(time=analysis_time_type).mean('time')

CPU times: user 1min 58s, sys: 4.44 s, total: 2min 3s
Wall time: 2min 10s


### Export Data (~15 minutes)

In [10]:
# inspect the final selection (shape, size and chunking) before it is loaded and written out
data_select

Unnamed: 0,Array,Chunk
Bytes,1.04 GiB,9.41 kiB
Shape,"(73000, 43, 89)","(1, 43, 56)"
Count,905002 Tasks,219000 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.04 GiB 9.41 kiB Shape (73000, 43, 89) (1, 43, 56) Count 905002 Tasks 219000 Chunks Type float32 numpy.ndarray",89  43  73000,

Unnamed: 0,Array,Chunk
Bytes,1.04 GiB,9.41 kiB
Shape,"(73000, 43, 89)","(1, 43, 56)"
Count,905002 Tasks,219000 Chunks
Type,float32,numpy.ndarray


In [11]:
%%time

outpath = '/glade/scratch/tking/cesm/'  # path for new observational data file
data_name = 'cesmLE_B1850C5CN_H2OSNO_1900_2099_1D_MRBplus'  # name for new data file

data_select.load().to_netcdf(outpath+data_name+'.nc')

CPU times: user 13min 8s, sys: 18.8 s, total: 13min 26s
Wall time: 14min 3s
