# Curating data for global calculations

# Import general packages
- If this is the first time running this notebook, you will need to set up an environment first;
  locally I'm just using my stitches interpreter.

In [1]:
import os

import numpy as np
import pandas as pd
import pkg_resources
import seaborn as sns
import statsmodels.api as sm
import xarray as xr

import stitches as stitches
# Plotting options
# Seaborn theme: scale fonts by 1.3, then switch to the "white" style.
sns.set(font_scale=1.3)
sns.set_style("white")
# For help with plotting
from matplotlib import pyplot as plt
# IPython magics: draw figures inline in the notebook and request the
# 'retina' inline figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Default figure size as (width, height).
plt.rcParams['figure.figsize'] = 12, 6
# Do not truncate columns when a data frame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Time slices, given as inclusive start/end date strings.
# Reference window (applied to the historical experiment):
ref_start, ref_end = '1995-01-01', '2014-12-31'

# Comparison window (applied to every other experiment):
comp_start, comp_end = '2015-01-01', '2099-12-31'

# Window length (presumably in years): the extraction loop requires at least
# 12 * window_length time steps before it accepts a file.
window_length = 20


# specify ESMs, variables, experiments

In [3]:
# CMIP6 ESMs to emulate, the variable(s) to emulate, and the experiments to pull.
# NOTE: IPSL and GFDL submitted results under grid labels other than `gn`, so
# they are not part of the stitches patches data. To include those ESMs, the
# pangeo table is sourced directly from pangeo and reshaped, rather than taken
# from the stitches package data.
esm = [
    'CAMS-CSM1-0',
    'MIROC6',
    'GFDL-ESM4',
    'FGOALS-g3',
    'MPI-ESM1-2-HR',
    'MPI-ESM1-2-LR',
    'MRI-ESM2-0',
    'ACCESS-ESM1-5',
    'IPSL-CM6A-LR',
    'CESM2-WACCM',
    'UKESM1-0-LL',
    'CanESM5',
]

vars1 = ['tas']

exps = [
    'historical',
    'ssp126',
    'ssp245',
    'ssp370',
    'ssp585',
    'ssp460',
    'ssp119',
    'ssp434',
    'ssp534-over',
]

# Pull pangeo dataframe with netcdf addresses for above

In [4]:
# Full pangeo catalogue, narrowed to the requested ESMs, variables and
# experiments at monthly atmospheric ('Amon') resolution.
pangeo_data = stitches.fx_pangeo.fetch_pangeo_table()

pangeo_data = pangeo_data[
    pangeo_data['source_id'].isin(esm)
    & pangeo_data['variable_id'].isin(vars1)
    & (pangeo_data['table_id'] == 'Amon')
    & pangeo_data['experiment_id'].isin(exps)
].copy()

# Reshape to look like the stitches package data, but holding the ESMs we
# want to include: keep the relevant columns and adopt the package's names.
pangeo_data = (
    pangeo_data[["source_id", "experiment_id", "member_id", "variable_id",
                 "grid_label", "zstore", "table_id"]]
    .rename(columns={"source_id": "model",
                     "experiment_id": "experiment",
                     "member_id": "ensemble",
                     "variable_id": "variable",
                     "table_id": "domain"})
    .reset_index(drop=True)
    .copy()
)

# Keep only p1 runs.
# The UK model only provides f2 runs, so its members are selected separately...
ukesm_data = (
    pangeo_data
    .loc[lambda table: table['model'].str.contains('UKESM')]
    .loc[lambda table: table['ensemble'].str.contains('i1p1f2')]
    .copy()
)

# ...while every other model is restricted to its f1 runs.
pangeo_data = pangeo_data.loc[
    lambda table: table['ensemble'].str.contains('i1p1f1')
].copy()

# Stack the UKESM rows underneath the other models.
pangeo_data = pd.concat([pangeo_data, ukesm_data], ignore_index=True).copy()

# Optional restriction to ensemble member 1 only (left disabled):
# pangeo_data = pangeo_data[pangeo_data['ensemble'].str.contains('r1i1')].copy()

# loop over files and do calculations

In [6]:
# Scratch version of the extraction loop: it only visits the first ESM, the
# first experiment and the first ensemble member, and stops after the time
# slicing so the sliced dataset `x` stays in the kernel.
varname = vars1[0]

holder = pd.DataFrame()
timeseries_holder = pd.DataFrame()

for esmname in esm[:1]:
    for exp in exps[:1]:

        print(esmname)
        print(exp)
        df_ens_avg = 0

        filelist = pangeo_data[
            (pangeo_data['model'] == esmname) & (pangeo_data['experiment'] == exp)
        ].copy()

        if filelist.empty:
            # No files for this experiment: placeholder row flagged with -999.
            print('no ensemble members for this exp')
            df_shaped = pd.DataFrame({'esm': [esmname],
                                      'experiment': exp,
                                      'ens_avg': -999,
                                      'ens_avg_iasd': -999,
                                      'ens_avg_sd': -999})
        else:
            df_sum = 0
            df_iav_sum = 0
            df_sd_sum = 0
            n_good_files = 0

            for i in [0]:  # range(len(filelist)):
                print(i)

                # Load the data and order it along time:
                x = stitches.fx_pangeo.fetch_nc(filelist.iloc[i].zstore).sortby('time').copy()

                # Historical runs are cut to the reference years, every other
                # experiment to the comparison years. UKESM1-0-LL ends on day
                # 30 of the final December (presumably its calendar has no
                # Dec 31 -- confirm).
                if exp == 'historical':
                    x = x.sel(time=slice(
                        ref_start,
                        '2014-12-30' if esmname == 'UKESM1-0-LL' else ref_end)).copy()
                else:
                    x = x.sel(time=slice(
                        comp_start,
                        '2099-12-30' if esmname == 'UKESM1-0-LL' else comp_end)).copy()

CAMS-CSM1-0
historical
0


In [21]:
# Scratch check: for every time step, compute the cos(latitude)-weighted mean
# in two ways and report the steps where they differ by 5e-12 or more.
x1 = (
    x[varname]
    .to_dataframe()
    .dropna()
    .reset_index()
    .reset_index(drop=True)
    .copy()
)

for name, group in x1.groupby('time'):
    lat = group['lat']
    area = np.cos(np.deg2rad(lat))
    # weights rescaled so that their mean is one
    weight = area / area.mean()

    # (1) weighted sum divided by the summed weights
    test1 = sum(area * group[varname]) / sum(area)
    # (2) plain mean of the values times the rescaled weights
    test2 = (weight * group[varname]).mean()

    if abs(test2 - test1) >= 5e-12:
        print('=============================')
        print(group.time.drop_duplicates())
        print(test2 - test1)


819200    1996-05-16 12:00:00
Name: time, dtype: object
5.5138116294983774e-12
1433600    1997-05-16 12:00:00
Name: time, dtype: object
-6.764366844436154e-12
1894400    1998-02-15 00:00:00
Name: time, dtype: object
7.219114195322618e-12
2662400    1999-05-16 12:00:00
Name: time, dtype: object
6.195932655828074e-12
4966400    2003-02-15 00:00:00
Name: time, dtype: object
6.139089236967266e-12
5580800    2004-02-15 00:00:00
Name: time, dtype: object
5.8548721426632255e-12
7731200    2007-08-16 12:00:00
Name: time, dtype: object
5.115907697472721e-12
7833600    2007-10-16 12:00:00
Name: time, dtype: object
5.5138116294983774e-12
8089600    2008-03-16 12:00:00
Name: time, dtype: object
5.6843418860808015e-12
9113600    2009-11-16 00:00:00
Name: time, dtype: object
5.5138116294983774e-12
9216000    2010-01-16 12:00:00
Name: time, dtype: object
-5.002220859751105e-12
10137600    2011-07-16 12:00:00
Name: time, dtype: object
-5.9117155615240335e-12
10905600    2012-10-16 12:00:00
Name: time,

In [14]:
# Scratch: repeat both weighted-mean formulations for the `group` currently
# held in the kernel.
lat = group['lat']
area = np.cos(np.deg2rad(lat))
# weights rescaled so that their mean is one
weight = area / area.mean()

# weighted sum over summed weights vs. mean of rescaled-weight products
test1 = sum(area * group[varname]) / sum(area)
test2 = (weight * group[varname]).mean()


In [17]:
# Difference between the two weighted-mean formulations (test2 minus test1),
# presumably for the `group` the scratch cells above evaluated last -- this
# depends on the order the cells were run in.
test2 - test1

-2.5011104298755527e-12

In [None]:
            # NOTE(review): unexecuted scratch fragment (In [None]). It keeps the
            # indentation of the loop body it was cut from, so it cannot run as
            # a standalone cell; the same QC check and aggregation appear in the
            # full extraction loop further down.
            # Check if there are the correct number of time steps in this
            # sliced data:
            # Very rough QC for checking complete netcdfs and assumes
            # comparison window and reference window same length.
            if (len(x.time) >= 12*window_length):
                # coerce to DF so we can properly lat weight to do spatial aggregation:
                x1 = x[varname].to_dataframe().dropna().reset_index().reset_index(drop=True).copy()

                # spatial aggregation:
                # one cos(latitude)-weighted mean per time step, appended row by row
                monthly_aoi = pd.DataFrame()
                for name, group in x1.groupby('time'):
                    lat = group['lat']
                    area = np.cos(np.deg2rad(lat))
                    df = pd.DataFrame({'time': group['time'].drop_duplicates()})
                    df['aggregate'] = sum(area * group[varname])/sum(area)
                    monthly_aoi = pd.concat([monthly_aoi, df]).reset_index(drop=True).copy()

In [5]:
varname = vars1[0]

# Value written to the summary table when an ESM/experiment pair has no usable
# ensemble member. It is used both when there are no files at all and when no
# file passes the completeness check (the latter used to divide by zero).
MISSING_VALUE = -999


def _slice_to_window(ds, esmname, exp):
    """Cut `ds` to the reference window (historical) or the comparison window.

    UKESM1-0-LL is sliced to day 30 rather than day 31 of the final December
    (presumably because its calendar has no Dec 31 -- TODO confirm).
    """
    is_ukesm = (esmname == 'UKESM1-0-LL')
    if exp == 'historical':
        start, end = ref_start, ('2014-12-30' if is_ukesm else ref_end)
    else:
        start, end = comp_start, ('2099-12-30' if is_ukesm else comp_end)
    return ds.sel(time=slice(start, end))


def _global_monthly_mean(gridded, value_col):
    """Return the cos(latitude)-weighted mean of `value_col` per time step.

    `gridded` is a long data frame with `time`, `lat` and `value_col` columns.
    The result has one row per time step and the columns `time` and
    `aggregate`.
    """
    times = []
    means = []
    for timestamp, group in gridded.groupby('time'):
        area = np.cos(np.deg2rad(group['lat']))
        times.append(timestamp)
        # Weighted sum over summed weights, using the builtin `sum` on purpose
        # so the values match earlier runs of this notebook exactly.
        means.append(sum(area * group[value_col]) / sum(area))
    return pd.DataFrame({'time': times,
                         'aggregate': pd.Series(means, dtype='float64')})


def _annual_means(monthly):
    """Average the per-time-step means by calendar year.

    Returns a data frame with the columns `year` and `ann_agg`, sorted by year.
    """
    years = [timestamp.year for timestamp in monthly['time']]
    return (monthly.assign(year=years)
                   .groupby('year', as_index=False)['aggregate'].mean()
                   .rename(columns={'aggregate': 'ann_agg'}))


# Summary records and per-member annual series are collected in lists and
# turned into data frames once, instead of re-concatenating a growing frame on
# every pass through the loops.
summary_rows = []
timeseries_frames = []

try:
    for esmname in esm:
        for exp in exps:

            print(esmname)
            print(exp)

            filelist = pangeo_data[(pangeo_data['model'] == esmname) &
                                   (pangeo_data['experiment'] == exp)]

            if filelist.empty:
                print('no ensemble members for this exp')

            # Per-member statistics for this ESM/experiment pair.
            member_avg = []
            member_iasd = []
            member_sd = []

            for i in range(len(filelist)):
                print(i)
                member = filelist.iloc[i]

                # Load data, order it along time and cut it to the window:
                x = stitches.fx_pangeo.fetch_nc(member['zstore'])
                x = _slice_to_window(x.sortby('time'), esmname, exp)

                # Very rough QC for complete netcdfs: require at least
                # 12 * window_length time steps. This assumes the comparison
                # window and the reference window have the same length.
                if len(x.time) < 12 * window_length:
                    continue

                # Coerce to a data frame so the latitude weighting can be
                # applied for the spatial aggregation:
                x1 = x[varname].to_dataframe().dropna().reset_index()

                # Spatial aggregation per time step, then annual averages:
                annual_aoi = _annual_means(_global_monthly_mean(x1, varname))

                # Statistics of the annual series for this ensemble member:
                # mean, standard deviation, and standard deviation of the
                # year-to-year differences.
                member_avg.append(annual_aoi['ann_agg'].mean())
                member_sd.append(annual_aoi['ann_agg'].std())
                member_iasd.append(annual_aoi['ann_agg'].diff().std())

                annual_aoi['esm'] = member['model']
                annual_aoi['experiment'] = member['experiment']
                annual_aoi['ensemble'] = member['ensemble']
                annual_aoi['variable'] = member['variable']
                timeseries_frames.append(annual_aoi)
                # end loop over file list

            # Ensemble average of the per-member statistics for this
            # ESM/experiment pair.
            n_good_files = len(member_avg)
            if n_good_files == 0:
                if not filelist.empty:
                    print('no complete ensemble members for this exp')
                ens_avg = ens_avg_iasd = ens_avg_sd = MISSING_VALUE
            else:
                ens_avg = sum(member_avg) / n_good_files
                ens_avg_iasd = sum(member_iasd) / n_good_files
                ens_avg_sd = sum(member_sd) / n_good_files

            summary_rows.append({'esm': esmname,
                                 'experiment': exp,
                                 'ens_avg': ens_avg,
                                 'ens_avg_iasd': ens_avg_iasd,
                                 'ens_avg_sd': ens_avg_sd,
                                 'var': varname})
            # end loop over experiments
    # end loop over esms
finally:
    # Build the frames from whatever was collected, so an interrupted or
    # failed run still leaves the partial results available.
    holder = pd.DataFrame(summary_rows)
    timeseries_holder = (pd.concat(timeseries_frames, ignore_index=True)
                         if timeseries_frames else pd.DataFrame())


CAMS-CSM1-0
historical
0
1
CAMS-CSM1-0
ssp126
0
1
CAMS-CSM1-0
ssp245
0
1
CAMS-CSM1-0
ssp370
0
1
CAMS-CSM1-0
ssp585
0
1
CAMS-CSM1-0
ssp460
no ensemble members for this exp
CAMS-CSM1-0
ssp119
0
1
CAMS-CSM1-0
ssp434
no ensemble members for this exp
CAMS-CSM1-0
ssp534-over
no ensemble members for this exp
MIROC6
historical
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
MIROC6
ssp126
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
MIROC6
ssp245
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
MIROC6
ssp370
0
1
2
MIROC6
ssp585
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
MIROC6
ssp460
0
MIROC6
ssp119
0
MIROC6
ssp434
0
MIROC6
ss

  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


ACCESS-ESM1-5
historical
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
ACCESS-ESM1-5
ssp126
0


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


1


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


2


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


3


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


4


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


5


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


6


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


7


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


8


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


9


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
ACCESS-ESM1-5
ssp245
0
1
2
3
4
5
6
7
8
9
ACCESS-ESM1-5
ssp370
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
ACCESS-ESM1-5
ssp585
0


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


1


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


2


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


3


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


4


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


5


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


6


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


7


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


8


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


9


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
ACCESS-ESM1-5
ssp460
no ensemble members for this exp
ACCESS-ESM1-5
ssp119
no ensemble members for this exp
ACCESS-ESM1-5
ssp434
no ensemble members for this exp
ACCESS-ESM1-5
ssp534-over
no ensemble members for this exp
IPSL-CM6A-LR
historical
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
IPSL-CM6A-LR
ssp126
0
1
2
3
4
5
IPSL-CM6A-LR
ssp245
0
1
2
3
4
5
6
7
8
9
10
IPSL-CM6A-LR
ssp370
0
1
2
3
4
5
6
7
8
9
10
IPSL-CM6A-LR
ssp585
0
1
2
3
4
5
IPSL-CM6A-LR
ssp460
0
1
2
3
4
5
IPSL-CM6A-LR
ssp119
0
1
2
3
4
5
IPSL-CM6A-LR
ssp434
0
IPSL-CM6A-LR
ssp534-over
0


  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime)
  return np.asarray(array[self.key], dtype=None)


CESM2-WACCM
historical
0
1
2
CESM2-WACCM
ssp126
0
CESM2-WACCM
ssp245
0
1
2
3
4
CESM2-WACCM
ssp370
0
1
2
CESM2-WACCM
ssp585
0
1
2
3
4
CESM2-WACCM
ssp460
no ensemble members for this exp
CESM2-WACCM
ssp119
no ensemble members for this exp
CESM2-WACCM
ssp434
no ensemble members for this exp
CESM2-WACCM
ssp534-over
0
1
2
UKESM1-0-LL
historical
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
UKESM1-0-LL
ssp126
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
UKESM1-0-LL
ssp245
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
UKESM1-0-LL
ssp370
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
UKESM1-0-LL
ssp585
0
1
2
3
4
UKESM1-0-LL
ssp460
no ensemble members for this exp
UKESM1-0-LL
ssp119
0
1
2
3
4
UKESM1-0-LL
ssp434
0
1
2
3
4
UKESM1-0-LL
ssp534-over
0
1
2
3
4
CanESM5
historical
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
CanESM5
ssp126
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
CanESM5
ssp245
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
CanESM5
ssp370
0
1
2
3
4
5
6


In [8]:
# Write the annual global-mean time series of every ESM / experiment /
# ensemble member to CSV. The output directory is created first, so the
# export does not fail on a fresh checkout after the long extraction run.
# NOTE(review): the file name says 2015_2100, but the frame also holds the
# rows of the historical experiment; the name is kept so existing readers of
# the file keep working.
out_dir = 'extracted_timeseries'
os.makedirs(out_dir, exist_ok=True)
timeseries_holder.to_csv(
    os.path.join(out_dir, 'global_' + varname + '_allesms_timeseries_2015_2100.csv'),
    index=False)
#holder.to_csv(('global_'+ varname+ '_allesms_2015_2100.csv'), index=False)