In [1]:
import xarray as xr
xr.set_options(display_style='html')
import intake
import cftime
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import numpy as np
import pandas as pd
import datetime
import seaborn as sns

### Open the CESM PPE catalog

In [2]:
# Path to the JSON catalog file describing the CESM PPE output.
cat_url = '/mnt/craas1-ns9989k-geo4992/data/catalogs/cesm-ppe.json'
# Open the catalog as a searchable datastore.
col = intake.open_esm_datastore(cat_url)
# Last expression of the cell: display the datastore summary.
col

Unnamed: 0,unique
experiment,1
ensemble,262
frequency,2
variable,124
units,27
long_name,124
vertical_levels,3
start_time,2
end_time,3
time_range,3



### Search corresponding data 

Please check [here](https://pangeo-data.github.io/escience-2022/pangeo101/data_discovery.html?highlight=cmip6) for info about CMIP and variables :) 

The variable search is particularly useful; you can find it here: https://clipc-services.ceda.ac.uk/dreq/mipVars.html 

In [3]:
# Narrow the catalog down to the monthly output of the four variables used below.
# To filter on the experiment as well, pass for example
#     experiment=['present-day', 'pre-industrial']
cat = col.search(variable=['CLDTOT', 'ACTNL', 'CCN3', 'FCTL'], frequency='monthly')

# Show the catalog rows that matched the search.
cat.df


Unnamed: 0,experiment,ensemble,frequency,variable,units,long_name,vertical_levels,start_time,end_time,time_range,path
0,present-day,0.0,monthly,ACTNL,m-3,Average Cloud Top droplet number,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
1,present-day,1.0,monthly,ACTNL,m-3,Average Cloud Top droplet number,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
2,present-day,2.0,monthly,ACTNL,m-3,Average Cloud Top droplet number,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
3,present-day,3.0,monthly,ACTNL,m-3,Average Cloud Top droplet number,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
4,present-day,4.0,monthly,ACTNL,m-3,Average Cloud Top droplet number,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
...,...,...,...,...,...,...,...,...,...,...,...
1043,present-day,258.0,monthly,FCTL,fraction,Fractional occurrence of cloud top liquid,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
1044,present-day,259.0,monthly,FCTL,fraction,Fractional occurrence of cloud top liquid,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
1045,present-day,260.0,monthly,FCTL,fraction,Fractional occurrence of cloud top liquid,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...
1046,present-day,261.0,monthly,FCTL,fraction,Fractional occurrence of cloud top liquid,1.0,0001-01-16,0003-12-16,0001-01-16-0003-12-16,/mnt/craas1-ns9989k-ns9560k/CAM6_CESM_PPE/PD/m...


In [4]:
# List the distinct variable names present in the search result.
cat.df['variable'].unique()

array(['ACTNL', 'CCN3', 'CLDTOT', 'FCTL'], dtype=object)

### Create dictionary from the list of datasets we found
- This step may take several minutes so be patient!

In [9]:
# Catalog columns that are assigned to `groupby_attrs` in the next cell; the
# dataset keys are built from these columns in this order, joined by '.'.
sep_labels=['experiment','variable','frequency','ensemble']

In [5]:
# Replace the catalog's grouping columns with sep_labels so that the dataset
# keys are built from experiment, variable, frequency and ensemble.
cat.esmcat.aggregation_control.groupby_attrs = sep_labels
# Display the aggregation settings to confirm the change.
cat.esmcat.aggregation_control#['groupby_attrs']

AggregationControl(variable_column_name='variable', groupby_attrs=['experiment', 'variable', 'frequency', 'ensemble'], aggregations=[Aggregation(type=<AggregationType.union: 'union'>, attribute_name='variable', options={}), Aggregation(type=<AggregationType.join_existing: 'join_existing'>, attribute_name='time_range', options={'dim': 'time', 'coords': 'minimal', 'compat': 'override'}), Aggregation(type=<AggregationType.join_existing: 'join_existing'>, attribute_name='ensemble', options={'dim': 'ensamble', 'coords': 'minimal', 'compat': 'override'})])

In [6]:
# Load every group of the search result into a dictionary of xarray Datasets.
# `xarray_open_kwargs` is the current keyword for options forwarded to xarray;
# it replaces the deprecated `zarr_kwargs` / `cdf_kwargs`.
# use_cftime=True decodes the time axis (catalog years 0001-0003) into cftime objects.
dset_dict = cat.to_dataset_dict(xarray_open_kwargs={'use_cftime': True})


--> The keys in the returned dictionary of datasets are constructed as follows:
	'experiment.variable.frequency.ensemble'


In [7]:
# Inspect the generated keys ('experiment.variable.frequency.ensemble').
dset_dict.keys()

dict_keys(['present-day.CLDTOT.monthly.217.0', 'present-day.CLDTOT.monthly.77.0', 'present-day.CCN3.monthly.78.0', 'present-day.ACTNL.monthly.256.0', 'present-day.CLDTOT.monthly.178.0', 'present-day.ACTNL.monthly.239.0', 'present-day.CLDTOT.monthly.9.0', 'present-day.ACTNL.monthly.203.0', 'present-day.FCTL.monthly.37.0', 'present-day.CLDTOT.monthly.39.0', 'present-day.FCTL.monthly.55.0', 'present-day.ACTNL.monthly.170.0', 'present-day.ACTNL.monthly.261.0', 'present-day.FCTL.monthly.15.0', 'present-day.ACTNL.monthly.36.0', 'present-day.FCTL.monthly.89.0', 'present-day.FCTL.monthly.220.0', 'present-day.ACTNL.monthly.132.0', 'present-day.CCN3.monthly.10.0', 'present-day.CLDTOT.monthly.133.0', 'present-day.FCTL.monthly.197.0', 'present-day.FCTL.monthly.72.0', 'present-day.CLDTOT.monthly.144.0', 'present-day.FCTL.monthly.245.0', 'present-day.FCTL.monthly.154.0', 'present-day.CLDTOT.monthly.85.0', 'present-day.FCTL.monthly.259.0', 'present-day.ACTNL.monthly.226.0', 'present-day.CLDTOT.monthl

In [8]:
# Keep the dataset keys as a plain list.
keys = list(dset_dict.keys())

In [18]:
# Re-organise the flat {key: dataset} mapping into
#     dic_nested[experiment][variable][ensemble_member] -> dataset
dic_nested = dict()
for key, _ds in dset_dict.items():
    # Keys look like 'present-day.CLDTOT.monthly.217.0'. Split off only the
    # first three fields so the ensemble label stays in one piece whether it is
    # rendered as '217.0' (float column) or as '217'.
    # The first field is named `experiment`, not `cat`, so the catalog object
    # `cat` is not overwritten by this loop.
    experiment, var, freq, _ens_label = key.split('.', 3)
    # Keep only the integer part, as a string (e.g. '217'); the merge step
    # converts it with int().
    en_mem = _ens_label.split('.')[0]
    dic_nested.setdefault(experiment, {}).setdefault(var, {})[en_mem] = _ds
    

In [37]:
# Combine the per-member datasets of each experiment/variable into one dataset
# with a sorted 'ens_mem' dimension:
#     dic_merged[experiment][variable] -> dataset
# NOTE(review): the loop variables `cat` and `var` are read by the following
# cell, so their names are kept. `cat` therefore shadows the catalog object
# from the search cell; re-run the search cell before using the catalog again.
dic_merged = dict()
for cat in dic_nested.keys():
    dic_merged[cat] = dict()
    for var in dic_nested[cat].keys():
        ls = []
        for en_mem, _ds in dic_nested[cat][var].items():
            # assign_coords returns a new dataset, so the objects stored in
            # dic_nested / dset_dict are not modified in place.
            _ds = _ds.assign_coords(ens_mem=int(en_mem))
            ls.append(_ds)
        # Stack the members along the scalar 'ens_mem' coordinate, then order
        # them by member number.
        _ds_c = xr.concat(ls, dim='ens_mem')
        _ds_c = _ds_c.sortby('ens_mem')
        dic_merged[cat][var] = _ds_c
        
    
    

In [38]:
# Display one merged dataset. `cat` and `var` hold whatever the loops in the
# previous cell assigned last.
dic_merged[cat][var]

Unnamed: 0,Array,Chunk
Bytes,1.94 GiB,7.59 MiB
Shape,"(262, 36, 192, 288)","(1, 36, 192, 288)"
Dask graph,262 chunks in 788 graph layers,262 chunks in 788 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.94 GiB 7.59 MiB Shape (262, 36, 192, 288) (1, 36, 192, 288) Dask graph 262 chunks in 788 graph layers Data type float32 numpy.ndarray",262  1  288  192  36,

Unnamed: 0,Array,Chunk
Bytes,1.94 GiB,7.59 MiB
Shape,"(262, 36, 192, 288)","(1, 36, 192, 288)"
Dask graph,262 chunks in 788 graph layers,262 chunks in 788 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Nice possible variables to check out: 


- ACTNL - Average Cloud Top droplet number - grid box average (not in-cloud, ask what this means if you don't know :) )
- FCTL - Fractional occurrence of cloud top liquid. If you average ACTNL and FCTL and then divide ACTNL by FCTL, you get the average in-cloud ACTNL weighted by cloud occurrence.
- CCN3 - CCN concentration at S=0.1%. 