# Curating data for global calculations

# Import general packages
- If this is the first time running this notebook, you will need to set up an environment first.
  Locally, I am just using my `stitches` interpreter.

In [1]:
import stitches as stitches


import pandas as pd
import pkg_resources
import xarray as xr
import numpy as np
import seaborn as sns

# Plotting options
# Apply the seaborn theme with a 1.3x font scale, then switch to the
# "white" axes style.
sns.set(font_scale=1.3)
sns.set_style("white")
# For help with plotting
from matplotlib import pyplot as plt
# IPython magics: render figures inline in the notebook and request the
# 'retina' figure format from the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Default figure size (width, height). Set after the seaborn calls above so
# that this value is the one left in rcParams.
plt.rcParams['figure.figsize'] = 12, 6
# Do not truncate the column list when displaying DataFrames.
pd.set_option('display.max_columns', None)

In [2]:
# Time slices, given as date strings.
# Reference period: applied to the `historical` experiment.
ref_start, ref_end = '1995-01-01', '2014-12-31'

# Comparison period: applied to every other experiment.
comp_start, comp_end = '2015-01-01', '2099-12-31'

# Minimum window length; the processing loop requires at least
# 12 * window_length time steps in a sliced file.
window_length = 20


# specify ESMs, variables, experiments

In [3]:
# CMIP6 ESMs to process, the variable(s) to aggregate and the experiments
# to look up for each ESM.
#
# NOTE: IPSL and GFDL submitted results under grid labels other than `gn`,
# so they are not included in the stitches patches data. To pull those ESMs
# the pangeo table is sourced directly from pangeo and reshaped, instead of
# relying on the stitches package data.
esm = [
    'CAMS-CSM1-0',
    'MIROC6',
    'GFDL-ESM4',
    'FGOALS-g3',
    'MPI-ESM1-2-HR',
    'MPI-ESM1-2-LR',
    'MRI-ESM2-0',
    'ACCESS-ESM1-5',
    'IPSL-CM6A-LR',
    'CESM2-WACCM',
    'UKESM1-0-LL',
    'CanESM5',
]

vars1 = ['tas']

exps = [
    'historical',
    'ssp126',
    'ssp245',
    'ssp370',
    'ssp585',
    'ssp460',
    'ssp119',
    'ssp434',
    'ssp534-over',
]

# Pull pangeo dataframe with netcdf addresses for above

In [4]:
# Fetch the pangeo table of ESM stores, used as the reference catalog.
pangeo_data = stitches.fx_pangeo.fetch_pangeo_table()

# Keep only `Amon` rows for the requested ESMs, variables and experiments.
wanted_rows = (
    pangeo_data['source_id'].isin(esm)
    & pangeo_data['variable_id'].isin(vars1)
    & (pangeo_data['table_id'] == 'Amon')
    & pangeo_data['experiment_id'].isin(exps)
)

# Reshape to look like the package data (same columns, stitches-style names)
# but with the ESMs we want to include.
wanted_columns = ["source_id", "experiment_id", "member_id", "variable_id",
                  "grid_label", "zstore", "table_id"]
stitches_names = {
    "source_id": "model",
    "experiment_id": "experiment",
    "member_id": "ensemble",
    "variable_id": "variable",
    "table_id": "domain",
}
pangeo_data = (
    pangeo_data.loc[wanted_rows, wanted_columns]
    .rename(columns=stitches_names)
    .reset_index(drop=True)
)

# Keep only p1 runs. The UK model is selected on its f2 runs, every other
# model on f1 runs. These are substring matches on the ensemble label.
is_ukesm = pangeo_data['model'].str.contains('UKESM')
is_f2 = pangeo_data['ensemble'].str.contains('i1p1f2')
is_f1 = pangeo_data['ensemble'].str.contains('i1p1f1')
ukesm_data = pangeo_data[is_ukesm & is_f2].copy()

# f1 rows first, followed by the UKESM f2 rows.
pangeo_data = pd.concat([pangeo_data[is_f1], ukesm_data]).reset_index(drop=True)

# Ensemble member 1 only:
is_member_one = pangeo_data['ensemble'].str.contains('r1i1')
pangeo_data = pangeo_data[is_member_one].copy()

# loop over files and do calculations

In [5]:
varname = vars1[0]

# Sentinel stored in the output table when no statistic could be computed
# for an ESM/experiment pair (no files at all, or no file long enough).
MISSING_VALUE = -999

# End dates used for UKESM1-0-LL in place of ref_end / comp_end.
# NOTE(review): presumably required because that model's calendar has no
# December 31 -- confirm against the model's time axis.
UKESM_REF_END = '2014-12-30'
UKESM_COMP_END = '2099-12-30'


def _slice_to_window(ds, exp, esmname):
    """Return `ds` restricted to the period that applies to `exp`.

    The `historical` experiment is cut to the reference period
    (ref_start..ref_end); every other experiment is cut to the comparison
    period (comp_start..comp_end). UKESM1-0-LL uses its own end dates.
    """
    is_ukesm = esmname == 'UKESM1-0-LL'
    if exp == 'historical':
        start, end = ref_start, (UKESM_REF_END if is_ukesm else ref_end)
    else:
        start, end = comp_start, (UKESM_COMP_END if is_ukesm else comp_end)
    return ds.sel(time=slice(start, end))


def _global_annual_stats(ds, varname):
    """Summarise one file as statistics of its annual aggregate series.

    Each time step is reduced to a cos(latitude)-weighted mean over all
    non-missing grid cells, time steps are averaged per calendar year, and
    three numbers are returned for the resulting annual series:

    Returns:
        (mean, sd, ia_sd): the mean of the annual values, their standard
        deviation, and the standard deviation of the year-to-year
        differences.
    """
    # Coerce to a DataFrame so each grid cell can be weighted by latitude.
    cells = ds[varname].to_dataframe().dropna().reset_index()
    weight = np.cos(np.deg2rad(cells['lat']))

    # Weighted mean per time step: sum(w * value) / sum(w), computed for all
    # time steps at once instead of one Python-level loop per month.
    weighted_sum = (weight * cells[varname]).groupby(cells['time']).sum()
    weight_sum = weight.groupby(cells['time']).sum()
    monthly = (weighted_sum / weight_sum).rename('aggregate').reset_index()

    # Average the time steps belonging to each calendar year.
    monthly['year'] = monthly['time'].apply(lambda t: t.year)
    annual = monthly.groupby('year')['aggregate'].mean()

    return annual.mean(), annual.std(), annual.diff().std()


# One record per (ESM, experiment); turned into a DataFrame once at the end
# rather than growing a DataFrame inside the loop.
records = []

for esmname in esm:
    for exp in exps:

        print(esmname)
        print(exp)

        filelist = pangeo_data[(pangeo_data['model'] == esmname) &
                               (pangeo_data['experiment'] == exp)]

        if filelist.empty:
            print('no ensemble members for this exp')

        # Statistics of every file that passes the completeness check.
        member_stats = []
        for i, zstore in enumerate(filelist['zstore']):
            print(i)

            # Load data, put it in time order and cut it to the period
            # that applies to this experiment.
            ds = stitches.fx_pangeo.fetch_nc(zstore).sortby('time')
            ds = _slice_to_window(ds, exp, esmname)

            # Very rough QC for complete netcdfs: require at least
            # window_length years of monthly time steps in the slice.
            if len(ds.time) >= 12 * window_length:
                member_stats.append(_global_annual_stats(ds, varname))

        if member_stats:
            # Average each statistic over the files that passed QC.
            n_good_files = len(member_stats)
            ens_avg = sum(s[0] for s in member_stats) / n_good_files
            ens_avg_sd = sum(s[1] for s in member_stats) / n_good_files
            ens_avg_iasd = sum(s[2] for s in member_stats) / n_good_files
        else:
            # Either there were no files, or none was long enough. Record
            # the sentinel instead of dividing by zero.
            if not filelist.empty:
                print('no complete files for this exp')
            ens_avg = ens_avg_sd = ens_avg_iasd = MISSING_VALUE

        records.append({'esm': esmname,
                        'experiment': exp,
                        'ens_avg': ens_avg,
                        'ens_avg_iasd': ens_avg_iasd,
                        'ens_avg_sd': ens_avg_sd,
                        'var': varname})
        # end loop over experiments
# end loop over esms

# Table holding one row of global statistics per ESM and experiment.
holder = pd.DataFrame(records,
                      columns=['esm', 'experiment', 'ens_avg',
                               'ens_avg_iasd', 'ens_avg_sd', 'var'])


CAMS-CSM1-0
historical
0
CAMS-CSM1-0
ssp126
0
CAMS-CSM1-0
ssp245
0
CAMS-CSM1-0
ssp370
0
CAMS-CSM1-0
ssp585
0
CAMS-CSM1-0
ssp460
no ensemble members for this exp
CAMS-CSM1-0
ssp119
0
CAMS-CSM1-0
ssp434
no ensemble members for this exp
CAMS-CSM1-0
ssp534-over
no ensemble members for this exp
MIROC6
historical
0
MIROC6
ssp126
0
MIROC6
ssp245
0
MIROC6
ssp370
0
MIROC6
ssp585
0
MIROC6
ssp460
0
MIROC6
ssp119
0
MIROC6
ssp434
0
MIROC6
ssp534-over
0
GFDL-ESM4
historical
0
GFDL-ESM4
ssp126
0
GFDL-ESM4
ssp245
0
GFDL-ESM4
ssp370
0
GFDL-ESM4
ssp585
0
GFDL-ESM4
ssp460
no ensemble members for this exp
GFDL-ESM4
ssp119
0
GFDL-ESM4
ssp434
no ensemble members for this exp
GFDL-ESM4
ssp534-over
no ensemble members for this exp
FGOALS-g3
historical
0
FGOALS-g3
ssp126
0
FGOALS-g3
ssp245
0
FGOALS-g3
ssp370
0
FGOALS-g3
ssp585
0
FGOALS-g3
ssp460
0
FGOALS-g3
ssp119
0
FGOALS-g3
ssp434
0
FGOALS-g3
ssp534-over
0
MPI-ESM1-2-HR
historical
0
MPI-ESM1-2-HR
ssp126
0
MPI-ESM1-2-HR
ssp245
0
MPI-ESM1-2-HR
ssp370
0
MPI-ESM

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


ACCESS-ESM1-5
historical
0
ACCESS-ESM1-5
ssp126
0


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


ACCESS-ESM1-5
ssp245
no ensemble members for this exp
ACCESS-ESM1-5
ssp370
no ensemble members for this exp
ACCESS-ESM1-5
ssp585
0


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


ACCESS-ESM1-5
ssp460
no ensemble members for this exp
ACCESS-ESM1-5
ssp119
no ensemble members for this exp
ACCESS-ESM1-5
ssp434
no ensemble members for this exp
ACCESS-ESM1-5
ssp534-over
no ensemble members for this exp
IPSL-CM6A-LR
historical
0
IPSL-CM6A-LR
ssp126
0
IPSL-CM6A-LR
ssp245
0
IPSL-CM6A-LR
ssp370
0
IPSL-CM6A-LR
ssp585
0
IPSL-CM6A-LR
ssp460
0
IPSL-CM6A-LR
ssp119
0
IPSL-CM6A-LR
ssp434
0
IPSL-CM6A-LR
ssp534-over
0


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


CESM2-WACCM
historical
0
CESM2-WACCM
ssp126
0
CESM2-WACCM
ssp245
0
CESM2-WACCM
ssp370
0
CESM2-WACCM
ssp585
0
CESM2-WACCM
ssp460
no ensemble members for this exp
CESM2-WACCM
ssp119
no ensemble members for this exp
CESM2-WACCM
ssp434
no ensemble members for this exp
CESM2-WACCM
ssp534-over
0
UKESM1-0-LL
historical
0
UKESM1-0-LL
ssp126
0
UKESM1-0-LL
ssp245
0
UKESM1-0-LL
ssp370
0
UKESM1-0-LL
ssp585
0
UKESM1-0-LL
ssp460
no ensemble members for this exp
UKESM1-0-LL
ssp119
0
UKESM1-0-LL
ssp434
0
UKESM1-0-LL
ssp534-over
0
CanESM5
historical
0
CanESM5
ssp126
0
CanESM5
ssp245
0
CanESM5
ssp370
0
CanESM5
ssp585
0
CanESM5
ssp460
0
CanESM5
ssp119
0
CanESM5
ssp434
0
CanESM5
ssp534-over
0


In [6]:
# Write the per-ESM, per-experiment statistics table to a CSV file in the
# working directory, without the DataFrame index.
output_csv = f'global_{varname}_allesms_r1_2015_2100.csv'
holder.to_csv(output_csv, index=False)