In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
#note: Dan uses "import glob" -- glob handles wildcards better than xarray for reading in multiple files

In [2]:
import intake

In [3]:
catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")
catalog.df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
0,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,hfss,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,pr,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,rsus,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,tas,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,ts,gn,,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...


In [4]:
test_sr = catalog.search(experiment_id=['historical'], variable_id='co2')

In [5]:
test_sr.unique('source_id')

{'source_id': {'count': 10,
  'values': ['BCC-CSM2-MR',
   'BCC-ESM1',
   'CNRM-ESM2-1',
   'MIROC-ES2L',
   'MRI-ESM2-0',
   'CESM2-FV2',
   'CESM2-WACCM-FV2',
   'CESM2-WACCM',
   'CESM2',
   'GFDL-ESM4']}}

In [6]:
#don't really understand this output
test_sr.unique('member_id')

{'member_id': {'count': 16,
  'values': ['r1i1p1f1',
   'r2i1p1f1',
   'r3i1p1f1',
   'r1i1p1f2',
   'r2i1p1f2',
   'r3i1p1f2',
   'r4i1p1f2',
   'r5i1p1f2',
   'r4i1p1f1',
   'r5i1p1f1',
   'r10i1p1f1',
   'r11i1p1f1',
   'r6i1p1f1',
   'r7i1p1f1',
   'r8i1p1f1',
   'r9i1p1f1']}}

### Reading catalog to load CO2 values for CMIP6 models
Multiple configurations of BCC and CESM available. Choosing one narrows models to 4.
Difficult to tell how many models have multiple ensemble members, so only using first member

In [7]:
table_id = 'Amon'
var = 'co2'
# note that there are different numbers of model output available depending on options used for source_id and member_id
# should I consider using multiple ensemble members per model? multiple model configurations per model?
member_id = ["r1i1p1f1","r2i1p1f1","r3i1p1f1","r4i1p1f1","r5i1p1f1","r6i1p1f1"]
cat = catalog.search(experiment_id=['historical'], variable_id=var,
#                        source_id=test_sr.unique('source_id')['source_id']['values'], 
                        source_id=['CESM2','BCC-ESM1','MRI-ESM2-0','GFDL-ESM4'], 
#                        table_id=table_id, member_id =member_id)
                        table_id=table_id, member_id ="r1i1p1f1")

In [8]:
cat.unique('source_id')['source_id']['values']

['BCC-ESM1', 'MRI-ESM2-0', 'CESM2', 'GFDL-ESM4']

In [9]:
cat.unique('member_id')['member_id']['values']

['r1i1p1f1']

In [10]:
cat.unique('grid_label')

{'grid_label': {'count': 2, 'values': ['gn', 'gr1']}}

### Converting data to dictionary

In [11]:
co2_ds = cat.to_dataset_dict()


xarray will load netCDF datasets with dask using a single chunk for all arrays.
For effective chunking, please provide chunks in cdf_kwargs.
For example: cdf_kwargs={'chunks': {'time': 36}}

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 4 group(s)


In [12]:
co2_ds.keys()

dict_keys(['CMIP.BCC.BCC-ESM1.historical.Amon.gn', 'CMIP.MRI.MRI-ESM2-0.historical.Amon.gn', 'CMIP.NCAR.CESM2.historical.Amon.gn', 'CMIP.NOAA-GFDL.GFDL-ESM4.historical.Amon.gr1'])

### Get the fixed variables output, only available for historical simulations (gridcell area, landfrac)


In [13]:
sr_fx_ctrl_vars = catalog.search(table_id = ['fx'], source_id = cat.unique('source_id')['source_id']['values'], 
                                 member_id = cat.unique('member_id')['member_id']['values'], 
                                 experiment_id = 'historical', variable_id = ['areacella', 'sftlf'])

In [14]:
sr_fx_ctrl_vars.unique('grid_label')

{'grid_label': {'count': 2, 'values': ['gn', 'gr1']}}

In [15]:
fx_ds = sr_fx_ctrl_vars.to_dataset_dict()


xarray will load netCDF datasets with dask using a single chunk for all arrays.
For effective chunking, please provide chunks in cdf_kwargs.
For example: cdf_kwargs={'chunks': {'time': 36}}

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 3 group(s)


In [16]:
#what happened to the BCC model? Doesn't seem to have fx (also checked /glade/collections/cmip/CMIP6/CMIP/BCC)
fx_ds.keys()

dict_keys(['CMIP.MRI.MRI-ESM2-0.historical.fx.gn', 'CMIP.NCAR.CESM2.historical.fx.gn', 'CMIP.NOAA-GFDL.GFDL-ESM4.historical.fx.gr1'])

###   
### Starting data analysis
###   

In [17]:
# This doesn't work as is. Testing portions of it below. 
#for sim_name, data in co2_ds.items():
#    print(data)
#    num_years = data.values.shape[0]//12
#    reshaped = data.values.reshape(num_years, 12, *data.values.shape[1:])
#    values = xr.DataArray(reshaped,
#                          dims=('year','month','lat','lon'),
#                          coords=(np.arange(1850, 1850+num_years), np.arange(12), data.lat, data.lon))
#    values.attrs['units'] = data.attrs['units']
#    co2_ds[sim_name] = values
    
#reshaped  = grid_dims.values.reshape(num_years,12,*grid_dims.values.shape[1:])

AttributeError: 'function' object has no attribute 'shape'

AttributeError: 'dict' object has no attribute 'time'

In [25]:
for sim_name, data in co2_ds.items():
    print(data)
#note that this isn't correct -- the time dimension is the last dimension in each of these. 
    num_years = data.co2.values.shape[5]//12
    print(num_years)
#    num_years = data.co2.values.shape(dim=time)


<xarray.Dataset>
Dimensions:    (bnds: 2, lat: 64, lon: 128, member_id: 1, plev: 19, time: 1980)
Coordinates:
  * member_id  (member_id) <U8 'r1i1p1f1'
  * plev       (plev) float64 1e+05 9.25e+04 8.5e+04 7e+04 ... 1e+03 500.0 100.0
  * lon        (lon) float64 0.0 2.812 5.625 8.438 ... 348.8 351.6 354.4 357.2
  * lat        (lat) float64 -87.86 -85.1 -82.31 -79.53 ... 82.31 85.1 87.86
  * time       (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00
Dimensions without coordinates: bnds
Data variables:
    lon_bnds   (lon, bnds) float64 dask.array<chunksize=(128, 2), meta=np.ndarray>
    lat_bnds   (lat, bnds) float64 dask.array<chunksize=(64, 2), meta=np.ndarray>
    time_bnds  (time, bnds) object dask.array<chunksize=(1980, 2), meta=np.ndarray>
    co2        (member_id, time, plev, lat, lon) float32 dask.array<chunksize=(1, 1980, 19, 64, 128), meta=np.ndarray>
Attributes:
    Conventions:            CF-1.7 CMIP-6.2
    activity_id:            CMIP
    branch_method:          

IndexError: tuple index out of range

In [17]:
years_mo = co2_ds.values.reshape(num_years,12,*co2_ds.values.shape[1:])

AttributeError: 'builtin_function_or_method' object has no attribute 'shape'

In [37]:
val  = co2_ds.get('CMIP.MRI.MRI-ESM2-0.historical.fx.gn')
print(val)

None
