# Script to Extract Climate Model and Observational Data for Comparison
**Input Data:** Climate and Observational Data  
**Output Data:** Climate and Observational data for a specific variable and spatial extent  
**Description:** Extracts data for a specific variable and spatial extent and exports them to a new file.  
**Date:** June 2022  
**Creator:** Emma Perkins  

In [1]:
# import relevant packages
import xarray as xr
import glob
import matplotlib as plt
import numpy as np
import cartopy.crs as ccrs

### Load Full Data

In [2]:
# full observational data
obs_paths = '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/*/'  # change to your paths
obs_names = 'e5.oper.an.sfc.128_167_2t.ll025sc.*.nc'  # change to your files, currently reading in ERA5 2-meter air temp
obs_files = sorted(glob.glob(obs_paths+obs_names))
obs_data = xr.open_mfdataset(obs_files, concat_dim=None) 

# full climate model data
clim_path = '/glade/campaign/cesm/collections/cesmLE/CESM-CAM5-BGC-LE/ice/proc/tseries/daily/hs_d/'  # change to your paths
clim_file1 = clim_path+'b.e11.B1850C5CN.f09_g16.005.cice.h1.hs_d_nh.19000101-19991231.nc'  # change to your files
clim_file2 = clim_path+'b.e11.B1850C5CN.f09_g16.005.cice.h1.hs_d_nh.20000101-20991231.nc'
clim_files = [clim_file1, clim_file2]
clim_data = xr.open_mfdataset(clim_files, concat_dim=None)

### Select Variable of Interest

In [3]:
clim_var = 'hs_d'  # change to variable of interest from climate model data
clim_select = clim_data[clim_var]

obs_var = 'VAR_2T'  # change to variable of interest from observational data
obs_select = obs_data[obs_var]

### Modify Lat Lon Coordinates

In [4]:
print(clim_select)

<xarray.DataArray 'hs_d' (time: 73000, nj: 104, ni: 320)>
dask.array<concatenate, shape=(73000, 104, 320), dtype=float32, chunksize=(36500, 104, 320), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) object 1900-01-02 00:00:00 ... 2100-01-01 00:00:00
    TLON     (nj, ni) float32 dask.array<chunksize=(104, 320), meta=np.ndarray>
    TLAT     (nj, ni) float32 dask.array<chunksize=(104, 320), meta=np.ndarray>
    ULON     (nj, ni) float32 dask.array<chunksize=(104, 320), meta=np.ndarray>
    ULAT     (nj, ni) float32 dask.array<chunksize=(104, 320), meta=np.ndarray>
Dimensions without coordinates: nj, ni
Attributes:
    units:          m
    long_name:      grid cell mean snow thickness
    cell_measures:  area: tarea
    comment:        snow volume per unit grid cell area
    cell_methods:   time: mean
    time_rep:       averaged


In [5]:
cmap1 = plt.cm.viridis
lev1 = np.arange(0,3.001,0.01)
ax1 = plt.axes(projection=ccrs.PlateCarree())
clim_select.isel(time=0).plot.contourf(cmap = cmap1, levels = lev1, transform=ccrs.PlateCarree())
ax1.coastlines()

TypeError: 'module' object is not callable

In [None]:
float(clim_select.isel(time=0)[75][75])

### Select Area of Interest

In [5]:
# determine variable names
obs_lat_var = 'latitude'  # name of latitude variable for observational data
clim_lat_var = 'TLAT'  # name of latitude variable for climate data
obs_lon_var = 'longitude'  # name of longitude variable for observational data
clim_lon_var = 'TLON'  # name of longitude variable for climate model data

# rename lat lon variables to all be lat lon
obs_select = obs_select.rename({obs_lat_var: 'lat', obs_lon_var: 'lon'})
clim_select = clim_select.rename({clim_lat_var: 'lat', clim_lon_var: 'lon'})

# sort by latitude:
obs_select = obs_select.sortby('lat')
clim_select = clim_select.sortby('lat')

obs_lon_type = 'long3'  # observational longitude coordinate type (long1 or long3)
clim_lon_type = 'long3'  # climate model longitude coordinate type (long1 or long3)
if clim_lon_type == 'long3':
    clim_lon_new = (clim_select.lon + 180) % 360 - 180
    clim_select['lon'] = clim_lon_new
if obs_lon_type == 'long3':
    obs_lon_new = (obs_select.lon + 180) % 360 - 180
    obs_select['lon'] = obs_lon_new
clim_select = clim_select.sortby('lon')
obs_select = obs_select.sortby('lon')

# select input area from left to right / west to east:
lat_min = 50  # minimum latitude
lat_max = 90  # maximum latitude
lon_min = 150  # minimum longitude
lon_max = -100  # maximum longitude

if lon_min < lon_max:
    obs_select = obs_select.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, lon_max))
    clim_select = clim_select.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, lon_max))
else:
    obs_select1 = obs_select.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, 180))
    obs_select2 = obs_select.sel(lat=slice(lat_min, lat_max), lon=slice(-180, lon_max))
    obs_select = xr.concat([obs_select1, obs_select2], dim='lon')

    clim_select1 = clim_select.sel(lat=slice(lat_min, lat_max), lon=slice(lon_min, 180))
    clim_select2 = clim_select.sel(lat=slice(lat_min, lat_max), lon=slice(-180, lon_max))
    clim_select = xr.concat([clim_select1, clim_select2], dim='lon')
clim_select = clim_select.sortby('lon')
obs_select = obs_select.sortby('lon')

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the

### Standardize Time Step

In [7]:
%%time

analysis_time_type = '1D'  # time step for analysis (ex: 3H, 1D, 1M, 1Y, etc.)

obs_select = obs_select.resample(time=analysis_time_type).sum('time')
clim_select = clim_select.resample(time=analysis_time_type).sum('time')

CPU times: user 2min 50s, sys: 2min 27s, total: 5min 17s
Wall time: 5min 58s


In [4]:
clim_select = clim_select * 86400
clim_select = clim_select.rename('TP')

In [5]:
print(clim_select)

<xarray.DataArray 'TP' (time: 73000, lat: 192, lon: 288)>
dask.array<mul, shape=(73000, 192, 288), dtype=float64, chunksize=(36500, 192, 288), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) object 1900-01-02 00:00:00 ... 2100-01-01 00:00:00
  * lat      (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 87.17 88.12 89.06 90.0
  * lon      (lon) float64 0.0 1.25 2.5 3.75 5.0 ... 355.0 356.2 357.5 358.8


### Export Data - Long Step (8+ hours, Run Overnight)

In [8]:
%%time

obs_outpath = '/glade/campaign/cgd/ppc/eperkins/era5/'  # path for new observational data file
clim_outpath = '/glade/campaign/cgd/ppc/eperkins/cesm/'  # path for new climate model data file
obs_name = 'era5_t2m_1979_2022_1D_MRBplus'  # name for new observational data file
clim_name = 'cesmLE_B1850C5CN_TP_1900_2099_1D_MRBplus'  # name for new climate model data file

# obs_select.load().to_netcdf(obs_outpath+obs_name+'.nc')
clim_select.load().to_netcdf(clim_outpath+clim_name+'.nc')

CPU times: user 16min 35s, sys: 3h 15min 4s, total: 3h 31min 39s
Wall time: 8h 39min 30s
