In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
#note: Dan uses "import glob" -- glob handles wildcards better than xarray for reading in multiple files

In [2]:
import intake

In [3]:
# Open the intake-esm datastore that catalogs the CMIP6 output stored on GLADE,
# then preview the first rows of its table to see which columns can be searched on.
catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")
catalog.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
0,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,hfss,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,pr,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,rsus,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,tas,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,ts,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...


In [4]:
# Exploratory search: every catalog entry for the historical experiment that provides the co2 variable.
test_sr = catalog.search(experiment_id=['historical'], variable_id='co2')

In [5]:
# Which models (source_id) have historical co2 output?
test_sr.unique('source_id')

{'source_id': {'count': 10,
  'values': ['BCC-CSM2-MR',
   'BCC-ESM1',
   'CNRM-ESM2-1',
   'MIROC-ES2L',
   'MRI-ESM2-0',
   'CESM2-FV2',
   'CESM2-WACCM-FV2',
   'CESM2-WACCM',
   'CESM2',
   'GFDL-ESM4']}}

In [None]:
# List the member_id values present in the historical co2 search results.
# NOTE(review): member_id looks like the CMIP6 variant label, i.e.
# r<realization>i<initialization>p<physics>f<forcing> indices -- confirm against the CMIP6 documentation.
test_sr.unique('member_id')

### Reading catalog to load CO2 values for CMIP6 models
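Multiple configurations of BCC and CESM are available. Keeping one configuration of each would narrow the set to 4 models; the search below keeps both BCC configurations, so it returns 5.
It is difficult to tell how many models have multiple ensemble members, so only the first member (`r1i1p1f1`) is used.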

In [6]:
table_id = 'Amon'
var = 'co2'
# The number of models returned depends on the source_id and member_id filters used below.
# Open questions: use multiple ensemble members per model? Multiple configurations per model?
# Alternative member filter (several ensemble members), currently unused:
#member_id = ["r1i1p1f1","r2i1p1f1","r3i1p1f1","r4i1p1f1","r5i1p1f1","r6i1p1f1"]
# MIROC does not show up with the r1i1p1f1 filter because its member_id is r1i2p1f1
# (and MIROC-ES2L is not in the explicit source_id list below either).
# NOTE(review): the "i2" is presumably a different initialization index in the CMIP6 variant label -- confirm.
cat = catalog.search(experiment_id=['historical'], variable_id=var,
# Alternative: search every model found by the exploratory search instead of an explicit list.
#                        source_id=test_sr.unique('source_id')['source_id']['values'], 
                        source_id=['CESM2','BCC-ESM1','BCC-CSM2-MR','MRI-ESM2-0','GFDL-ESM4'], 
# Alternative: use the multi-member list defined (commented out) above.
#                        table_id=table_id, member_id =member_id)
                        table_id=table_id, member_id ="r1i1p1f1")

In [7]:
# Models (source_id) that survived the filtered search.
cat.unique('source_id')['source_id']['values']

['BCC-CSM2-MR', 'BCC-ESM1', 'MRI-ESM2-0', 'CESM2', 'GFDL-ESM4']

In [8]:
# Ensemble members (member_id) present in the filtered search.
cat.unique('member_id')['member_id']['values']

['r1i1p1f1']

In [9]:
# Grid labels present in the filtered search (the models do not all share one grid label).
cat.unique('grid_label')

{'grid_label': {'count': 2, 'values': ['gn', 'gr1']}}

### Converting data to dictionary

In [10]:
# Load the filtered catalog entries into a dictionary of datasets, keyed by
# 'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'.
# NOTE(review): the warning printed by this call suggests passing chunks via
# cdf_kwargs (e.g. cdf_kwargs={'chunks': {'time': 36}}) for effective dask chunking -- consider adding it.
co2_ds = cat.to_dataset_dict()


xarray will load netCDF datasets with dask using a single chunk for all arrays.
For effective chunking, please provide chunks in cdf_kwargs.
For example: cdf_kwargs={'chunks': {'time': 36}}

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 5 group(s)


In [11]:
# One key per model/grid group that was loaded.
co2_ds.keys()

dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'CMIP.BCC.BCC-ESM1.historical.Amon.gn', 'CMIP.MRI.MRI-ESM2-0.historical.Amon.gn', 'CMIP.NCAR.CESM2.historical.Amon.gn', 'CMIP.NOAA-GFDL.GFDL-ESM4.historical.Amon.gr1'])

### Get the fixed variables output, only available for historical simulations (gridcell area, landfrac)


In [None]:
# Search the 'fx' (fixed fields) table for grid-cell area (areacella) and land fraction (sftlf),
# restricted to the same models and ensemble member(s) as the co2 search stored in `cat`.
sr_fx_ctrl_vars = catalog.search(table_id = ['fx'], source_id = cat.unique('source_id')['source_id']['values'], 
                                 member_id = cat.unique('member_id')['member_id']['values'], 
                                 experiment_id = 'historical', variable_id = ['areacella', 'sftlf'])

In [None]:
# Grid labels available for the fixed fields.
sr_fx_ctrl_vars.unique('grid_label')

In [None]:
# Load the fixed-field search results into a dictionary of datasets.
fx_ds = sr_fx_ctrl_vars.to_dataset_dict()

In [None]:
# Inspect which models actually returned fixed fields.
# BCC does not seem to provide fx output: nothing was found under
# /glade/collections/cmip/CMIP6/CMIP/BCC or in the online archive, so expect no BCC keys here.
fx_ds.keys()

### Starting data analysis

In [12]:
for sim_name, data in co2_ds.items():
    # Promote the bounds variables to coordinates (M. Long's suggestion) so that the
    # subtraction below is not applied to them. set_coords works on a Dataset, not
    # on the dictionary, hence the per-model loop.
    data = data.set_coords(['time_bnds', 'lat_bnds', 'lon_bnds'])

    # Amplitude for each year: annual maximum minus annual minimum.
    datamax = data.resample(time="Y").max()
    datamin = data.resample(time="Y").min()
    co2amp = datamax - datamin
    print(co2amp)

    # NOTE(review): this replaces the loaded dataset in co2_ds with its amplitude,
    # so re-running this cell operates on the amplitudes rather than the raw data;
    # re-run the to_dataset_dict() cell first.
    co2_ds[sim_name] = co2amp

<xarray.Dataset>
Dimensions:    (bnds: 2, lat: 160, lon: 320, member_id: 1, plev: 19, time: 165)
Coordinates:
  * time       (time) object 1850-12-31 00:00:00 ... 2014-12-31 00:00:00
    lon_bnds   (lon, bnds) float64 -0.5625 0.5625 0.5625 ... 358.3 358.3 359.4
    lat_bnds   (lat, bnds) float64 -90.0 -88.59 -88.59 ... 88.59 88.59 90.0
  * member_id  (member_id) <U8 'r1i1p1f1'
  * plev       (plev) float64 1e+05 9.25e+04 8.5e+04 7e+04 ... 1e+03 500.0 100.0
  * lat        (lat) float64 -89.14 -88.03 -86.91 -85.79 ... 86.91 88.03 89.14
  * lon        (lon) float64 0.0 1.125 2.25 3.375 ... 355.5 356.6 357.8 358.9
Dimensions without coordinates: bnds
Data variables:
    co2        (time, member_id, plev, lat, lon) float32 dask.array<chunksize=(1, 1, 19, 160, 320), meta=np.ndarray>
<xarray.Dataset>
Dimensions:    (bnds: 2, lat: 64, lon: 128, member_id: 1, plev: 19, time: 165)
Coordinates:
  * time       (time) object 1850-12-31 00:00:00 ... 2014-12-31 00:00:00
    lon_bnds   (lon, bnds) flo

In [None]:
# Note: This doesn't work due to calendar mismatches.
# See example here for reading in a dataset: https://geocat-examples.readthedocs.io/en/latest/gallery/XY/NCL_xy_18.html#sphx-glr-gallery-xy-ncl-xy-18-py
# Not sure how to do this for reading in a dictionary

#for amp_name, amp in co2_ds.items():
#    amp_stnd = amp.sel(time=slice('1850','2014')) - amp.sel(time=slice('1850'))

In [13]:
#From Matt Long:
def infer_lat_name(ds):
    """Return the name under which latitude is stored in ``ds``.

    The candidates 'latitude' and 'lat' are tried in that order; the first one
    for which ``name in ds`` holds is returned.

    Raises ValueError if neither candidate is present.
    """
    found = next((name for name in ('latitude', 'lat') if name in ds), None)
    if found is None:
        raise ValueError('could not determine lat name')
    return found


def infer_lon_name(ds):
    """Return the name under which longitude is stored in ``ds``.

    The candidates 'longitude' and 'lon' are tried in that order; the first one
    for which ``name in ds`` holds is returned.

    Raises ValueError if neither candidate is present.
    """
    found = next((name for name in ('longitude', 'lon') if name in ds), None)
    if found is None:
        raise ValueError('could not determine lon name')
    return found

In [15]:
#Also from Matt Long:
def lat_weights_regular_grid(lat):
    """
    Generate latitude weights for equally spaced (regular) global grids.

    Weights are computed as sin(lat+dlat/2)-sin(lat-dlat/2). A cell centred on a
    pole only spans half a grid spacing (from 90-dlat/2 to 90 degrees), so its
    weight is 1-sin(90-dlat/2). For a grid covering the whole globe the weights
    sum to 2.0.

    Parameters
    ----------
    lat : array-like
        1-D latitudes in degrees with uniform spacing.

    Returns
    -------
    Array of weights, one per latitude, of the same array type as ``lat``.

    Raises
    ------
    AssertionError
        If the latitude spacing is not uniform (to numpy's default
        ``assert_almost_equal`` precision).
    """
    dlat = np.abs(np.diff(lat))
    np.testing.assert_almost_equal(dlat, dlat[0])
    half_dlat = dlat[0] / 2.
    w = np.abs(np.sin(np.radians(lat + half_dlat)) - np.sin(np.radians(lat - half_dlat)))

    # Polar cap weight. Everything is kept in degrees before np.radians: the
    # previous expression fed np.pi / 2 (radians) minus a full dlat (degrees)
    # into np.radians, which gave a wrong pole weight and broke the sum-to-2 property.
    pole_weight = np.abs(1. - np.sin(np.radians(90. - half_dlat)))

    if np.abs(lat[0]) > 89.9999:
        w[0] = pole_weight

    if np.abs(lat[-1]) > 89.9999:
        w[-1] = pole_weight

    return w

In [16]:
#Also from Matt Long:
def compute_grid_area(ds, check_total=True):
    """Compute the area of grid cells.

    Latitude weights from ``lat_weights_regular_grid`` are broadcast across
    longitude and rescaled so that they sum to the surface area of a sphere
    with the Earth radius used below.

    Parameters
    ----------
    ds : object indexable by the latitude/longitude names
        Presumably an xarray Dataset holding the grid coordinates -- confirm against callers.
    check_total : bool, optional
        When true, assert that the summed cell areas approximately equal the sphere's surface area.

    Returns
    -------
    xr.DataArray with dims (lat, lon), units 'm^2' and long_name 'area'.
    """
    earth_radius = 6.37122e6  # m, radius of Earth
    earth_surface = 4.0 * np.pi * earth_radius**2  # area of earth [m^2]

    lon_name = infer_lon_name(ds)
    lat_name = infer_lat_name(ds)

    lat_weights = lat_weights_regular_grid(ds[lat_name])
    # Adding a zero-valued longitude term broadcasts the weights to (lat, lon).
    weights_2d = lat_weights + 0.0 * ds[lon_name]
    scale = earth_surface / weights_2d.sum(dim=(lat_name, lon_name))
    area = scale * weights_2d

    if check_total:
        np.testing.assert_approx_equal(np.sum(area), earth_surface)

    return xr.DataArray(area, dims=(lat_name, lon_name), attrs={'units': 'm^2', 'long_name': 'area'})

### To Do next
 - atm level for CO2 (looks like all are standardized at 19 levels)
 - decide what region (point, global, NH) to get data for, and figure out how to do the appropriate weighting
     - note: if just choosing 30N, perhaps the lat/lon weighting Katie tried is enough -- I don't need the fx vars
 - what kind of figure to make? An annual cycle, or just a single value?
     - maybe start by plotting the annual cycle for all CMIP6 models north of 30N

In [None]:
# Look up the MRI-ESM2-0 fixed-field dataset. The key has table_id 'fx', so it
# lives in the fx dictionary (fx_ds); co2_ds only holds 'Amon' keys, so .get()
# on it always returned None.
val = fx_ds.get('CMIP.MRI.MRI-ESM2-0.historical.fx.gn')
print(val)