# Global averaging

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from scipy.signal import detrend
from matplotlib import pyplot as plt
from scipy import signal
import pandas as pd
import xarray as xr
import intake
import pprint
import util 
import os

# URL of the Pangeo CMIP6 catalog (JSON) on Google Cloud Storage; opened here as an
# intake datastore that the later cells query with `col.search(...)`.
col_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(col_url)

# Overview table read with pd.read_table in the next cell; its index holds the model names.
file = 'available_tas_data_cloud_june12th_2020.txt'

In [2]:
# load table:
# The first column of the file becomes the index (index_col=0).
data_table = pd.read_table(file,index_col=0)
# The model names are the index of the 'piControl (yrs)' column.
models_used = data_table['piControl (yrs)'].index
print(models_used)

Index(['IPSL-CM6A-LR', 'MRI-ESM2-0', 'MPI-ESM1-2-LR', 'TaiESM1',
       'AWI-CM-1-1-MR', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1',
       'CAMS-CSM1-0', 'FGOALS-f3-L', 'FGOALS-g3', 'IITM-ESM', 'CanESM5',
       'CanESM5-CanOE', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1',
       'ACCESS-ESM1-5', 'ACCESS-CM2', 'E3SM-1-0', 'E3SM-1-1', 'E3SM-1-1-ECA',
       'EC-Earth3', 'EC-Earth3-LR', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR',
       'FIO-ESM-2-0', 'MPI-ESM-1-2-HAM', 'INM-CM4-8', 'INM-CM5-0',
       'MIROC-ES2L', 'MIROC6', 'HadGEM3-GC31-LL', 'HadGEM3-GC31-MM',
       'UKESM1-0-LL', 'MPI-ESM1-2-HR', 'GISS-E2-1-G', 'GISS-E2-1-G-CC',
       'GISS-E2-1-H', 'GISS-E2-2-G', 'CESM2', 'CESM2-FV2', 'CESM2-WACCM',
       'CESM2-WACCM-FV2', 'NorCPM1', 'NorESM1-F', 'NorESM2-LM', 'NorESM2-MM',
       'KACE-1-0-G', 'GFDL-CM4', 'GFDL-ESM4', 'NESM3', 'SAM0-UNICON', 'CIESM',
       'MCM-UA-1-0'],
      dtype='object')


In [3]:
# load functions:
def area_weights(lat_bnds, lon_bnds):
    """Exact relative grid-cell areas per latitude band on a perfect sphere.

    Parameters
    ----------
    lat_bnds : 2-D array, one row per latitude band, columns = (lower, upper)
        edge in degrees.
    lon_bnds : 2-D array of longitude cell edges in degrees; only the first
        row is read, i.e. all longitude cells are taken to have equal width.

    Returns
    -------
    1-D array with one weight per latitude band, scaled so its mean is 1.
    """
    south_edge = np.radians(lat_bnds[:, 0])
    north_edge = np.radians(lat_bnds[:, 1])
    # width of the first longitude cell (length-1 array, broadcasts below)
    lon_width = np.radians(np.diff(lon_bnds[0, :]))
    # area of a lat/lon cell on the unit sphere: dlon * (sin(north) - sin(south))
    band_area = lon_width * (np.sin(north_edge) - np.sin(south_edge))
    return band_area / band_area.mean()

# function copied from: http://xarray.pydata.org/en/stable/examples/monthly-means.html
def leap_year(year, calendar='standard'):
    """Determine if `year` is a leap year in the given calendar.

    Rules applied:
      * 'julian': every 4th year is a leap year.
      * 'proleptic_gregorian': every 4th year, except century years that are
        not divisible by 400.
      * 'standard' / 'gregorian' (mixed calendar): the Julian rule up to and
        including 1582, the Gregorian rule from 1583 onwards.
      * any other calendar name (e.g. 'noleap', '360_day'): never a leap year.

    Note: this deviates from the copied xarray example, which applied the
    century exception to years *before* 1583 in the mixed calendar and thereby
    treated e.g. 1900 and 2100 as leap years.

    Returns a plain bool.
    """
    if calendar not in ('standard', 'gregorian', 'proleptic_gregorian', 'julian'):
        return False
    if year % 4 != 0:
        return False

    # Century years not divisible by 400 are the only exceptions to "every 4th year".
    century_exception = (year % 100 == 0) and (year % 400 != 0)
    if century_exception:
        if calendar == 'proleptic_gregorian':
            return False
        # the mixed calendar follows the Gregorian rule only after the 1582 reform
        if calendar in ('standard', 'gregorian') and year >= 1583:
            return False
    return True

# function copied from: http://xarray.pydata.org/en/stable/examples/monthly-means.html
def get_dpm(time, calendar='standard'):
    """
    Return an integer array with the number of days in each month of `time`.

    Parameters
    ----------
    time : index-like object with equally long `.month` and `.year` sequences
        (the notebook passes `ds.time.to_index()`).
    calendar : key into the module-level `dpm` table; an unknown name raises
        KeyError.

    Returns
    -------
    numpy integer array of length `len(time)`; February gets one extra day
    whenever `leap_year(year, calendar)` is true.
    """
    # builtin `int` instead of `np.int`: the alias was deprecated in NumPy 1.20
    # and removed in 1.24; the resulting dtype is the same
    month_length = np.zeros(len(time), dtype=int)

    cal_days = dpm[calendar]

    for i, (month, year) in enumerate(zip(time.month, time.year)):
        month_length[i] = cal_days[month]
        if leap_year(year, calendar=calendar) and month == 2:
            month_length[i] += 1
    return month_length

# days per month:
# Each list is indexed by month number 1-12 (index 0 is a placeholder).
# February holds the non-leap length; get_dpm adds the leap day where leap_year says so.
# 'standard' (the default calendar of get_dpm/leap_year) and the CF aliases
# '365_day', 'all_leap' and '366_day' are included so that those names do not raise KeyError.
dpm = {'noleap': [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       '365_day': [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       'standard': [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       'gregorian': [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       # Julian months have the same lengths as Gregorian ones; only the leap-year rule differs
       'julian': [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       'proleptic_gregorian': [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       'all_leap': [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       '366_day': [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31],
       '360_day': [0, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30]
      }

def compute_day_weights(ds, calendar=None): # new function
    """Monthly weights proportional to month length, with mean 1 within each year.

    Parameters
    ----------
    ds : dataset with a monthly `time` coordinate.
    calendar : calendar name understood by `get_dpm`. When omitted, the
        notebook-level global `ds_calendar` is used; this keeps existing
        `compute_day_weights(ds)` calls working, which always used that global
        because the argument was previously ignored.

    Returns
    -------
    DataArray along `ds.time`: days in the month divided by the mean month
    length of the calendar year the month belongs to.
    """
    if calendar is None:
        # legacy fallback: set in the notebook via `ds_calendar = calendar_check(model)`
        calendar = ds_calendar

    time_index = ds.time.to_index()
    days = get_dpm(time_index, calendar=calendar)
    month_length = xr.DataArray(days, coords=[ds.time], name='month_length')

    ##### This code is only tested for noleap so far #####
    # Map every month to its calendar year and average the month lengths per year.
    # Unlike tiling each annual mean 12 times, this also works when a year is
    # incomplete or the time axis does not start in January.
    years = np.asarray(time_index.year)
    _, year_of_month = np.unique(years, return_inverse=True)
    annual_mean_length = np.bincount(year_of_month, weights=days) / np.bincount(year_of_month)

    # normalized to have mean 1 within each year
    weights = month_length / annual_mean_length[year_of_month]
    return weights

def calendar_check(model):
    """Return the calendar name used for `model`.

    The calendars were found from a manual check of the piControl runs. For a
    few models the calendar is only guessed from other models of the same
    institution; a notice is printed for those. A notice is also printed for
    the two 360_day models whose other experiments are unverified.

    Parameters
    ----------
    model : CMIP6 source_id, e.g. 'CNRM-CM6-1'.

    Returns
    -------
    One of 'noleap', 'gregorian', 'proleptic_gregorian', '360_day', 'julian'.

    Raises
    ------
    ValueError
        If `model` is not in any of the tables (previously this surfaced as an
        UnboundLocalError at the return statement).
    """
    # Time formats for piControl, found from manual check:
    confirmed = {
        'noleap': ['TaiESM1', 'BCC-CSM2-MR', 'BCC-ESM1', 'CAMS-CSM1-0', 'CAS-ESM2-0',
                   'FGOALS-f3-L', 'FGOALS-g3', 'CanESM5', 'CanESM5-CanOE', 'E3SM-1-0',
                   'E3SM-1-1', 'E3SM-1-1-ECA', 'FIO-ESM-2-0', 'INM-CM4-8', 'INM-CM5-0',
                   'GISS-E2-1-G', 'GISS-E2-1-G-CC', 'GISS-E2-1-H', 'GISS-E2-2-G', 'CESM2',
                   'CESM2-FV2', 'CESM2-WACCM', 'CESM2-WACCM-FV2', 'NorCPM1', 'NorESM1-F',
                   'NorESM2-LM', 'NorESM2-MM', 'GFDL-CM4', 'SAM0-UNICON', 'GFDL-ESM4',
                   'CIESM', 'MCM-UA-1-0'],
        'gregorian': ['CNRM-CM6-1', 'IPSL-CM6A-LR', 'MIROC-ES2L', 'MIROC6', 'NESM3'],
        'proleptic_gregorian': ['AWI-CM-1-1-MR', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR',
                                'ACCESS-ESM1-5', 'ACCESS-CM2', 'MPI-ESM-1-2-HAM',
                                'MPI-ESM1-2-LR', 'MPI-ESM1-2-HR'],
        '360_day': ['UKESM1-0-LL', 'HadGEM3-GC31-LL', 'HadGEM3-GC31-MM', 'CNRM-ESM2-1',
                    'KACE-1-0-G', 'MRI-ESM2-0'],
        'julian': ['IITM-ESM'],
    }
    # Calendar not verified; guessed from other models of the same institution.
    guessed = {
        'proleptic_gregorian': ['EC-Earth3', 'EC-Earth3-LR', 'AWI-ESM-1-1-LR'],
        'gregorian': ['CNRM-CM6-1-HR'],
    }

    for ds_calendar, models in confirmed.items():
        if model in models:
            if model in ['CNRM-ESM2-1', 'MRI-ESM2-0']:
                print('piControl is 360_day, the other experiments unknown')
            return ds_calendar

    for ds_calendar, models in guessed.items():
        if model in models:
            print('not 100% sure what calendar this model has, but a guess is made based on other models from same institution')
            # prints 'calendar is likely proleptic gregorian' / 'calendar is likely gregorian'
            print('calendar is likely ' + ds_calendar.replace('_', ' '))
            return ds_calendar

    raise ValueError('no calendar registered for model ' + repr(model))
    

# Choose model, exp, member

In [51]:
# Pick the model to process: either by position in `models_used` or by name.
#model = models_used[10];
model = 'CNRM-CM6-1'

print(model)
# Look up the model's calendar; `ds_calendar` is used further down when the day weights are computed.
ds_calendar = calendar_check(model)
print(ds_calendar, 'calendar')

#check what experiments are available for var
# NOTE(review): in the visible cells `var` is only referenced by the commented-out search below.
var = 'tas'
#cat = col.search(experiment_id = ['piControl', 'abrupt-4xCO2', 'abrupt-2xCO2', 'abrupt-0p5xCO2', 'historical', 'ssp126', 'ssp245', 'ssp370', 'ssp585'], source_id = model, variable_id=var, table_id='Amon') 
# Catalog entries of the four monthly (Amon) variables for this model's piControl run.
cat = col.search(experiment_id = 'piControl', source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon') 
#cat = col.search(experiment_id = 'historical', source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon') 


# Display the matching catalog rows.
cat.df


CNRM-CM6-1
gregorian calendar


Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
23354,CMIP,CNRM-CERFACS,CNRM-CM6-1,piControl,r1i1p1f2,Amon,rlut,gr,gs://cmip6/CMIP/CNRM-CERFACS/CNRM-CM6-1/piCont...,,20180814
23358,CMIP,CNRM-CERFACS,CNRM-CM6-1,piControl,r1i1p1f2,Amon,rsdt,gr,gs://cmip6/CMIP/CNRM-CERFACS/CNRM-CM6-1/piCont...,,20180814
23361,CMIP,CNRM-CERFACS,CNRM-CM6-1,piControl,r1i1p1f2,Amon,rsut,gr,gs://cmip6/CMIP/CNRM-CERFACS/CNRM-CM6-1/piCont...,,20180814
23365,CMIP,CNRM-CERFACS,CNRM-CM6-1,piControl,r1i1p1f2,Amon,tas,gr,gs://cmip6/CMIP/CNRM-CERFACS/CNRM-CM6-1/piCont...,,20180814


In [52]:
# Experiment to process; uncomment exactly one.
exp = 'piControl'
#exp = 'abrupt-4xCO2'
#exp = 'abrupt-2xCO2'
#exp = 'abrupt-0p5xCO2'
#exp = 'historical'
#exp = 'ssp126'
#exp = 'ssp245'
#exp = 'ssp370'
#exp = 'ssp585'
#exp = 'piClim-4xCO2'
#exp = 'piClim-control'
#exp = 'piClim-histall'


# Build the catalog query. All special cases form ONE if/elif chain so that the
# generic `else` search cannot replace a model-specific one (the UKESM1-0-LL case
# used to be a separate `if`, so its MOHC-only search was immediately overwritten).
if model == 'UKESM1-0-LL':
    cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon', institution_id = 'MOHC')
    #cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon', institution_id = 'NIMS-KMA')
elif model == 'MPI-ESM1-2-HR': # select institution if there are two groups
    cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon')
    #cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon', institution_id = 'DKRZ') 
    #cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon', institution_id = 'DWD') 
elif model == 'MRI-ESM2-0' and exp == 'ssp585': 
    cat = col.search(experiment_id = exp, source_id = model, variable_id=['rlut'], table_id='Amon', member_id = 'r1i1p1f1') 
    #cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rsut', 'rsdt'], table_id='Amon', member_id = 'r1i1p1f1')
elif model in ['MPI-ESM1-2-LR', 'IPSL-CM6A-LR', 'MRI-ESM2-0'] and exp in ['piControl']: # problem when loading more than one member simultaneously, so specify member_id:
    cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon', member_id = 'r1i2p1f1')
else:
    cat = col.search(experiment_id = exp, source_id = model, variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon') 
    
# Drop results of a previous run of this cell, so that a failed load cannot be
# mistaken for fresh data further down.
if 'dset_dict' in locals():
    del dset_dict
if 'ds_exp' in locals():
    del ds_exp
if 'members_sorted' in locals():
    del members_sorted
dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True}, cdf_kwargs={'chunks': {}})
# If the search returns several groups (keys), the dataset of the last key is the one kept.
#for key in ['CMIP.MOHC.UKESM1-0-LL.historical.Amon.gn']:
for key in dset_dict.keys():
    ds_exp = dset_dict[key]
    members_sorted = ds_exp.member_id.sortby(ds_exp.member_id)
    
# this model names its horizontal coordinates differently; align with the other models
if model == 'MCM-UA-1-0':
    ds_exp = ds_exp.rename({'longitude': 'lon','latitude': 'lat'}) 
        
for member in members_sorted:
    print(member.values)
    
# write out data variables, to check that we have all we want
ds_exp.data_vars

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 1 group(s)
r1i1p1f2


Data variables:
    rlut     (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
    rsdt     (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
    rsut     (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
    height   float64 ...
    tas      (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>

In [53]:
# Show the dataset's `grid` attribute (free-text description of the model grid).
ds_exp.grid

'data regridded to a T127 gaussian grid (128x256 latlon) from a native atmosphere T127l reduced gaussian grid'

In [54]:
#pd.set_option('display.max_colwidth', -1)
#cat.df['zstore']
# Show the time coordinate to check the calendar type and the time span of the run.
ds_exp['time']

<xarray.DataArray 'time' (time: 6000)>
array([cftime.DatetimeGregorian(1850, 1, 16, 12, 0, 0, 0, 2, 16),
       cftime.DatetimeGregorian(1850, 2, 15, 0, 0, 0, 0, 4, 46),
       cftime.DatetimeGregorian(1850, 3, 16, 12, 0, 0, 0, 5, 75), ...,
       cftime.DatetimeGregorian(2349, 10, 16, 12, 0, 0, 0, 6, 289),
       cftime.DatetimeGregorian(2349, 11, 16, 0, 0, 0, 0, 2, 320),
       cftime.DatetimeGregorian(2349, 12, 16, 12, 0, 0, 0, 4, 350)],
      dtype=object)
Coordinates:
  * time     (time) object 1850-01-16 12:00:00 ... 2349-12-16 12:00:00
Attributes:
    axis:           T
    bounds:         time_bounds
    long_name:      Time axis
    standard_name:  time
    time_origin:    1850-01-01 00:00:00

In [55]:
# Display the dataset summary: dimensions, coordinates, data variables and attributes.
ds_exp

<xarray.Dataset>
Dimensions:    (lat: 128, lon: 256, member_id: 1, time: 6000)
Coordinates:
  * lon        (lon) float64 0.0 1.406 2.812 4.219 ... 354.4 355.8 357.2 358.6
  * lat        (lat) float64 -88.93 -87.54 -86.14 -84.74 ... 86.14 87.54 88.93
  * time       (time) object 1850-01-16 12:00:00 ... 2349-12-16 12:00:00
  * member_id  (member_id) <U8 'r1i1p1f2'
Data variables:
    rlut       (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
    rsdt       (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
    rsut       (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
    height     float64 ...
    tas        (member_id, time, lat, lon) float32 dask.array<chunksize=(1, 600, 128, 256), meta=np.ndarray>
Attributes:
    CMIP6_CV_version:       cv=6.2.3.0-7-g2019642
    parent_activity_id:     CMIP
    dr2xml_version:         1.0
    history:                n

In [81]:
#ds_exp.rlut.isel(time = 10).plot()
#dset_dict.keys()

In [82]:
# testing model EC earth3
#ds_test = ds_exp.sel(member_id = 'r4i1p1f1')
#ds_test['tas'].isel(time=0).values[0]
#import sys
#np.set_printoptions(threshold=sys.maxsize)
#ds_test['tas'].isel(time=0).values[0].shape
#ds_test.lon

In [45]:
# loop through members
# For every ensemble member: area-weighted global mean, then day-weighted annual
# mean of each variable, written to one CSV file per member.
#for member in [members_sorted.sel(member_id = 'r2i1p3f1')]:
for member in members_sorted:
    print(member.values)

    ds = ds_exp.sel(member_id = member)
    # compute weights for average
    if model == 'NorCPM1' and exp == 'historical':
        # lat_bnds has an extra leading dimension for this dataset; use its first slice
        area_w = area_weights(ds.lat_bnds.values[0,:,:], ds.lon_bnds.values)
    else:
        area_w = area_weights(ds.lat_bnds.values, ds.lon_bnds.values)
    # pass the calendar explicitly instead of relying on the function reading a global
    day_weights = compute_day_weights(ds, calendar=ds_calendar)

    #if exp == 'piClim-4xCO2':
    #    varlist = ['rlut', 'rsut', 'rsdt']
    #else:
    varlist = ['tas', 'rlut', 'rsut', 'rsdt']
    
    # one Series of annual means per variable; combined into a DataFrame after the loop
    annual_means = {}
    for variable in varlist:
        print(variable)
        
        #if model == 'MRI-ESM2-0' and exp == 'ssp585' and member == 'r1i1p1f1' and variable == 'rlut':
        #    cat = col.search(experiment_id = exp, source_id = model, variable_id=['rlut'], table_id='Amon', member_id = 'r1i1p1f1')
        #    dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True}, cdf_kwargs={'chunks': {}})
        #    for key in dset_dict.keys():
        #        ds_exp = dset_dict[key]
        #        ds = ds_exp.sel(member_id = member)
        data = ds[variable]
                
        # global average (lat is put last so the per-latitude weights broadcast over it)
        area_avg = (data.transpose('time', 'lon', 'lat') * area_w).mean(dim=['lon', 'lat'])

        # annual average
        day_weighted_avg = area_avg*day_weights
        annualmean = day_weighted_avg.groupby('time.year').mean('time')

        annual_means[variable] = pd.Series(annualmean.values)

    # Series are aligned on their integer index, so a shorter variable is padded
    # with NaN exactly as the former outer merges did.
    df = pd.DataFrame(annual_means)
            
    filename = model + '_' + exp + '_' + str(member.values) + '_means.txt'
    # `out_path` rather than `file`: `file` holds the name of the model table read at the
    # top of the notebook and must not be overwritten here.
    out_path = os.path.join('../Processed_data/Global_annual_means/', model, exp, filename)
    # Create the directory for every member and tolerate an existing one, so that re-runs
    # and runs over a subset of members work. Existing result files are overwritten.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    df.to_csv(out_path)
    

r1i1p1f1
tas


KeyboardInterrupt: 

In [46]:
# Time coordinate of the member processed last, converted to an index
# (the same form that compute_day_weights hands to get_dpm).
ds.time.to_index()

CFTimeIndex([1850-01-16 12:00:00, 1850-02-15 00:00:00, 1850-03-16 12:00:00,
             1850-04-16 00:00:00, 1850-05-16 12:00:00, 1850-06-16 00:00:00,
             1850-07-16 12:00:00, 1850-08-16 12:00:00, 1850-09-16 00:00:00,
             1850-10-16 12:00:00,
             ...
             2349-03-16 12:00:00, 2349-04-16 00:00:00, 2349-05-16 12:00:00,
             2349-06-16 00:00:00, 2349-07-16 12:00:00, 2349-08-16 12:00:00,
             2349-09-16 00:00:00, 2349-10-16 12:00:00, 2349-11-16 00:00:00,
             2349-12-16 12:00:00],
            dtype='object', name='time', length=6000)

In [49]:
# Years of the time index. Computed directly from `ds`, so this cell does not depend on a
# scratch variable that is only assigned in a cell further down (NameError on Run All).
ds.time.to_index().year

array([1850, 1850, 1850, ..., 2349, 2349, 2349])

In [47]:
# Scratch variable: time index of the current `ds`, kept for interactive inspection.
test = ds.time.to_index()

In [39]:
# Display the scratch time index.
test

Index([1991-01-16 12:00:00, 1991-02-15 00:00:00, 1991-03-16 12:00:00,
       1991-04-16 00:00:00, 1991-05-16 12:00:00, 1991-06-16 00:00:00,
       1991-07-16 12:00:00, 1991-08-16 12:00:00, 1991-09-16 00:00:00,
       1991-10-16 12:00:00,
       ...
       2419-03-16 12:00:00, 2419-04-16 00:00:00, 2419-05-16 12:00:00,
       2419-06-16 00:00:00, 2419-07-16 12:00:00, 2419-08-16 12:00:00,
       2419-09-16 00:00:00, 2419-10-16 12:00:00, 2419-11-16 00:00:00,
       2419-12-16 12:00:00],
      dtype='object', name='time', length=2424)

In [None]:
# Annual global means of the member processed last in the loop (one column per variable).
df

In [None]:
# NorCPM1 historical: nans in lat_bnds

# Updated: NorESM, NorCPM, CESM2, GISS


In [47]:
# Show which model is currently selected.
model

'GISS-E2-1-G'

In [99]:
# check dataset availability:
# NOTE: this cell re-opens the catalog and overwrites the notebook-level `col` and `cat`.
#col_url = "https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json"
col_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(col_url)
#cat = col.search(experiment_id = 'piClim-4xCO2', variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon')
#cat = col.search(experiment_id = 'piClim-control', source_id = 'BCC-ESM1', variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon')
#cat = col.search(experiment_id = 'piClim-control', variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon')
# Catalog rows of the four monthly variables for all NorCPM1 historical members.
cat = col.search(experiment_id = 'historical', source_id = 'NorCPM1', variable_id=['tas', 'rlut', 'rsut', 'rsdt'], table_id='Amon')
cat.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year,version
68117,CMIP,NCC,NorCPM1,historical,r10i1p1f1,Amon,rlut,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r10i1p1...,,20190914
68121,CMIP,NCC,NorCPM1,historical,r10i1p1f1,Amon,rsdt,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r10i1p1...,,20190914
68124,CMIP,NCC,NorCPM1,historical,r10i1p1f1,Amon,rsut,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r10i1p1...,,20190914
68126,CMIP,NCC,NorCPM1,historical,r10i1p1f1,Amon,tas,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r10i1p1...,,20190914
68190,CMIP,NCC,NorCPM1,historical,r11i1p1f1,Amon,rlut,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r11i1p1...,,20190914
...,...,...,...,...,...,...,...,...,...,...,...
69948,CMIP,NCC,NorCPM1,historical,r8i1p1f1,Amon,tas,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r8i1p1f...,,20190914
70002,CMIP,NCC,NorCPM1,historical,r9i1p1f1,Amon,rlut,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r9i1p1f...,,20190914
70006,CMIP,NCC,NorCPM1,historical,r9i1p1f1,Amon,rsdt,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r9i1p1f...,,20190914
70009,CMIP,NCC,NorCPM1,historical,r9i1p1f1,Amon,rsut,gn,gs://cmip6/CMIP/NCC/NorCPM1/historical/r9i1p1f...,,20190914
