# Global averaging of data in Google Cloud

In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import intake
import json
import os
import importlib
import global_annual_means
from global_annual_means import *

experiment_list = ['piControl', 'abrupt-4xCO2', '1pctCO2',\
                   'abrupt-2xCO2', 'abrupt-0p5xCO2',\
                   'historical', 'hist-GHG', 'hist-nat', 'hist-aer',\
                  'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp585',\
                  'piClim-control', 'piClim-4xCO2', 'piClim-histall']

# for some reason this function could not be loaded from a .py file,
# it may have something to do with the function loading of another file
def error_test(exp, model, member):
    with open('../Data_availability/GoogleCloud/cloud-buterrors.json', 'r') as f:
         error_data_dict = json.load(f)
    if exp in error_data_dict:
        if model in error_data_dict[exp]:
            if member in error_data_dict[exp][model]:
                return True

col_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(col_url)
col

Unnamed: 0,unique
activity_id,18
institution_id,36
source_id,88
experiment_id,170
member_id,657
table_id,37
variable_id,700
grid_label,10
zstore,514968
dcpp_init_year,60


In [2]:
experiment_list[5]

'historical'

In [6]:
#importlib.reload(global_annual_means)
#from global_annual_means import *

#for exp in experiment_list:
for exp in [experiment_list[5]]:
    with open('../Data_availability/GoogleCloud/cloud-'+exp+'.json', 'r') as f:
         data_dict = json.load(f)
    #print(exp, data_dict)
    for model in ['EC-Earth3']:
    #for model in data_dict:
        for member in ['r13i1p1f1', 'r15i1p1f1']:
        #for member in data_dict[model]:
            if error_test(exp, model, member) == True:
                print(exp, model, member, 'skipped because it contains errors')
                continue

            filename = model + '_' + exp + '_' + str(member) + '_means.csv'
            filepath = os.path.join('../Processed_data/Global_annual_means/', model, exp)
            # check if file exists already:
            if os.path.isfile(filepath + '/' + filename):
                print(exp, model, member, 'global means already exist, and will therefore not be computed')
            #    continue
            else:
                print(exp, model, member)
            if sepvar_test(exp, model, member) == True:
                # load each variable separately because of an error in the time coordinates
                avg_df = global_average_sepvar(exp, model, member)
            else:
                ds = ds_cloud(exp, model, member)
                avg_df = global_averaging(ds)

            # save dataframe as csv:
            if os.path.isdir(filepath) == False:
                os.makedirs(filepath)
            avg_df.to_csv(filepath + '/' + filename)


historical EC-Earth3 r13i1p1f1 global means already exist, and will therefore not be computed

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


first month of dataset is: 12
tas
rlut
rsut
rsdt
historical EC-Earth3 r15i1p1f1 global means already exist, and will therefore not be computed

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


first month of dataset is: 12
tas
rlut
rsut
rsdt


In [7]:
avg_df

Unnamed: 0_level_0,tas,rlut,rsut,rsdt
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1850,287.024878,241.991483,98.457597,340.282563
1851,287.026583,242.010503,98.253183,340.271688
1852,287.016881,242.000590,97.952380,340.301727
1853,287.260882,242.141159,97.883823,340.250531
1854,287.269304,242.394825,97.847370,340.223418
...,...,...,...,...
2010,288.212442,241.474229,98.304101,340.252580
2011,288.241259,241.470862,98.262291,340.319857
2012,288.238787,241.342817,97.858688,340.379604
2013,288.295173,241.496770,98.193031,340.363820


## Check if there are missing time periods in any of the datasets:

In [7]:
from processing_functions import *
experiments = make_exp_dict()
#model_names = [key for key in experiments.keys()]

In [8]:
nandata_list = []
for model in experiments.keys():
    for exp in experiments[model]:
        exp_path = '../Processed_data/Global_annual_means/' + model + '/' + exp + '/'
        for member in experiments[model][exp]:
            # load exp data
            exp_filename = model + '_' + exp + '_' + member + '_means.csv'
            exp_data = pd.read_table(exp_path + exp_filename, index_col=0, sep = ',')
            if np.isnan(exp_data).values.any():
                nandata_list.append([model, exp, member])
                print(model, exp, member, 'data contain', np.isnan(exp_data).values.sum(), 'NaN values')
                

AWI-CM-1-1-MR historical r1i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r2i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r3i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r4i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r5i1p1f1 data contain 1 NaN values
EC-Earth3-Veg piControl r1i1p1f1 data contain 3000 NaN values
EC-Earth3-Veg piControl r1i1p1f1 data contain 3000 NaN values
FIO-ESM-2-0 piControl r1i1p1f1 data contain 325 NaN values


After first run through all data in the cloud: (most of these are now downloaded and averaged)

ACCESS-ESM1-5 piControl r1i1p1f1 data contain 200 NaN values
AWI-CM-1-1-MR historical r1i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r2i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r3i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r4i1p1f1 data contain 1 NaN values
AWI-CM-1-1-MR historical r5i1p1f1 data contain 1 NaN values
CAS-ESM2-0 piControl r1i1p1f1 data contain 151 NaN values
CAS-ESM2-0 abrupt-4xCO2 r1i1p1f1 data contain 34 NaN values
CESM2-WACCM ssp585 r1i1p1f1 data contain 199 NaN values
CESM2-WACCM ssp126 r1i1p1f1 data contain 597 NaN values
CIESM ssp585 r1i1p1f1 data contain 344 NaN values
EC-Earth3-Veg piControl r1i1p1f1 data contain 3000 NaN values
FGOALS-g3 historical r5i1p1f1 data contain 117 NaN values
FGOALS-g3 historical r6i1p1f1 data contain 284 NaN values
FIO-ESM-2-0 piControl r1i1p1f1 data contain 325 NaN values
GFDL-CM4 abrupt-4xCO2 r1i1p1f1 data contain 280 NaN values
HadGEM3-GC31-LL piControl r1i1p1f1 data contain 3000 NaN values
MIROC6 ssp245 r21i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r22i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r23i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r24i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r25i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r26i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r27i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r28i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r29i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r30i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r31i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r32i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r33i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r34i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r35i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r36i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r37i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r38i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r39i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r40i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r41i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r42i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r43i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r44i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r45i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r46i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r47i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r48i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r49i1p1f1 data contain 183 NaN values
MIROC6 ssp245 r50i1p1f1 data contain 183 NaN values
MPI-ESM-1-2-HAM piControl r1i1p1f1 data contain 440 NaN values
MRI-ESM2-0 piClim-control r1i1p1f1 data contain 3 NaN values
MRI-ESM2-0 ssp245 r2i1p1f1 data contain 210 NaN values
MRI-ESM2-0 ssp245 r3i1p1f1 data contain 210 NaN values
MRI-ESM2-0 ssp245 r4i1p1f1 data contain 210 NaN values
MRI-ESM2-0 ssp245 r5i1p1f1 data contain 210 NaN values
MRI-ESM2-0 piClim-4xCO2 r1i1p1f1 data contain 3 NaN values
NorCPM1 historical r11i1p1f1 data contain 210 NaN values
NorCPM1 historical r12i1p1f1 data contain 210 NaN values
NorCPM1 historical r13i1p1f1 data contain 210 NaN values
NorCPM1 historical r14i1p1f1 data contain 360 NaN values
NorCPM1 historical r15i1p1f1 data contain 210 NaN values
NorCPM1 historical r16i1p1f1 data contain 210 NaN values
NorCPM1 historical r17i1p1f1 data contain 210 NaN values
NorCPM1 historical r19i1p1f1 data contain 360 NaN values
NorCPM1 historical r1i1p1f1 data contain 210 NaN values
NorCPM1 historical r21i1p1f1 data contain 360 NaN values
NorCPM1 historical r22i1p1f1 data contain 210 NaN values
NorCPM1 historical r24i1p1f1 data contain 360 NaN values
NorCPM1 historical r25i1p1f1 data contain 210 NaN values
NorCPM1 historical r26i1p1f1 data contain 360 NaN values
NorCPM1 historical r27i1p1f1 data contain 210 NaN values
NorCPM1 historical r28i1p1f1 data contain 360 NaN values
NorCPM1 historical r29i1p1f1 data contain 510 NaN values
NorCPM1 historical r2i1p1f1 data contain 360 NaN values
NorCPM1 historical r30i1p1f1 data contain 360 NaN values
NorCPM1 historical r3i1p1f1 data contain 210 NaN values
NorCPM1 historical r4i1p1f1 data contain 210 NaN values
NorCPM1 historical r5i1p1f1 data contain 360 NaN values
NorCPM1 historical r7i1p1f1 data contain 210 NaN values
UKESM1-0-LL ssp585 r4i1p1f2 data contain 400 NaN values
UKESM1-0-LL ssp245 r10i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r11i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r12i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r16i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r17i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r18i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r19i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r5i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r6i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r7i1p1f2 data contain 162 NaN values
UKESM1-0-LL ssp245 r9i1p1f2 data contain 162 NaN values

In [4]:
unique_models = list(set([nandata_list[i][0] for i in range(len(nandata_list))]))
unique_models.sort()
unique_models

['ACCESS-ESM1-5',
 'AWI-CM-1-1-MR',
 'CAS-ESM2-0',
 'CESM2-WACCM',
 'CIESM',
 'EC-Earth3-Veg',
 'FGOALS-g3',
 'FIO-ESM-2-0',
 'GFDL-CM4',
 'HadGEM3-GC31-LL',
 'MIROC6',
 'MPI-ESM-1-2-HAM',
 'MRI-ESM2-0',
 'NorCPM1',
 'UKESM1-0-LL']

## The following did not seem to improve anything. Instead, many datasets have been downloaded instead

In [5]:
for model in [unique_models[10]]:
#for model in unique_models:
    for exp in experiments[model]:
        exp_path = '../Processed_data/Global_annual_means/' + model + '/' + exp + '/'
        for member in experiments[model][exp]:
            # load exp data
            exp_filename = model + '_' + exp + '_' + member + '_means.csv'
            exp_data = pd.read_table(exp_path + exp_filename, index_col=0, sep = ',')
            if np.isnan(exp_data).values.any():
                print(model, exp, member, 'data contain NaN values')
                
                # recompute and check if data have been added later:
                if sepvar_test(exp, model, member) == True:
                    # load each variable separately because of an error in the time coordinates
                    avg_df = global_average_sepvar(exp, model, member)
                else:
                    ds = ds_cloud(exp, model, member)
                    avg_df = global_averaging(ds)
                if np.isnan(avg_df).values.any():
                    print(model, exp, member, 'still contain NaN values after recomputing')
                else:
                    print('no longer missing time periods for:', model, exp, member)
                    if len(avg_df) >= len(exp_data):
                        # save dataframe as csv:
                        avg_df.to_csv(exp_path + '/' + exp_filename)
                        print('Global means are replaced')
                    else:
                        print('New data is shorter than the old, and therefore not replaced')
                    
                

MIROC6 ssp245 r21i1p1f1 data contain NaN values

--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


first month of dataset is: 1
tas
rlut


KeyError: 'rlut'

In [None]:
# For FIO-ESM-2-0 tas is from a different version of the dataset than the other variables.
# which has a differen time period. This is the same as I find in ESGF.


# EC-Earth3-Veg: obtaing the last 1000 years of the 2000 year long piControl
# is a problem, since wget scripts can only be generated for 1000 files


# NorCPM1 historical: many members contain also data that seems to belong in historical-ext instead (for years 2015-2029)