# Compute global means for data missing in the cloud by downloading data with wget scripts

In [1]:
from pyesgf.search import SearchConnection
from pyesgf.logon import LogonManager
import importlib
from global_annual_means import *
import json
import os
import shutil
import xarray as xr
import tempfile
import subprocess
import datetime

# Open a distributed (distrib=True) search connection to the LLNL ESGF node.
# expire_after is a 10-hour timedelta (0 days, 10*3600 seconds).
conn = SearchConnection(
    'https://esgf-node.llnl.gov/esg-search',
    distrib=True,
    expire_after=datetime.timedelta(hours=10),
)
conn

<pyesgf.search.connection.SearchConnection at 0x7f82ce152cc0>

In [None]:
# Create the ESGF logon manager and report whether a session is already active.
lm = LogonManager()
# Uncomment to drop the current session first:
#lm.logoff()
lm.is_logged_on()

In [None]:
# Log on with the author's OpenID; no password is passed in from the notebook.
# NOTE(review): presumably the password is then asked for interactively -- confirm
# against the pyesgf documentation.
lm.logon_with_openid(
    openid='https://esgf-node.llnl.gov/esgf-idp/openid/hegebeate',
    password=None,
)
# Report whether the logon succeeded.
lm.is_logged_on()

In [None]:
# Experiments to process. The order matters: other cells pick entries by
# position (e.g. experiment_list[11], experiment_list[1:]).
experiment_list = [
    'piControl', 'abrupt-4xCO2', '1pctCO2',
    'abrupt-2xCO2', 'abrupt-0p5xCO2',
    'historical', 'hist-GHG', 'hist-nat', 'hist-aer',
    'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp585',
    'piClim-control', 'piClim-4xCO2', 'piClim-histall',
]

# Variables requested from ESGF for every experiment/model/member.
variable_list = ['tas', 'rsut', 'rlut', 'rsdt'][:1] + ['rlut', 'rsut', 'rsdt']

In [None]:
# Show which experiment sits at index 11 of experiment_list.
experiment_list[11]

In [None]:
# Generate one ESGF wget download script per experiment/model/member that is
# listed in the "missing-<exp>.json" files.
#
# Files are initially downloaded to the temporary directory where the wget
# scripts end up; they are moved into per-experiment/model folders later.
# NOTE(review): this path is machine-specific -- adjust before running elsewhere.
download_dir = '/var/folders/53/fvz76zt55t3fbyz2bxr_l6540000gn/T'

for exp in experiment_list:
#for exp in [experiment_list[11]]:
    # The JSON file maps each model name to the members to process.
    with open('../Data_availability/GoogleCloud/missing-'+exp+'.json', 'r') as f:
        data_dict = json.load(f)
    # Alternative input, kept as a manual toggle:
    #with open('../Data_availability/GoogleCloud/cloud-buterrors.json', 'r') as f:
    #     data_dict = json.load(f)
    #if exp in data_dict:
    #    data_dict = data_dict[exp]
    #    print(exp, data_dict)
    #else:
    #    continue
    print(exp, data_dict)

    #for model in ['EC-Earth3']:
    for model in data_dict:
        #for member in ['r20i1p1f1']:
        for member in data_dict[model]:
            filename = model + '_' + exp + '_' + str(member) + '_means.csv'
            filepath = os.path.join('../Processed_data/Global_annual_means_csv/', model, exp)
            exp_model_dir = download_dir + '/CMIP6_data/' + exp + '_data/' + model

            # Nothing to download when the global means were already computed.
            if os.path.isfile(os.path.join(filepath, filename)):
                print(exp, model, member, 'global means already exist, so data will not be downloaded')
                continue
            elif os.path.isdir(exp_model_dir):
                print('Folder for putting files in is already created.')
                filesindir = [f.name for f in os.scandir(exp_model_dir) if f.name.endswith('.nc')]
                # Match the member as a whole underscore-delimited field so that
                # e.g. 'r1i1p1f1' does not also pick up files of 'r1i1p1f11'.
                member_files = [file for file in filesindir if '_' + member + '_' in file]
                if member_files:
                    print(len(member_files), 'files for', exp, model, member, 'exist')
                    # Count files per variable; the variable name is the leading
                    # field of the file name, so 'tas' must not match 'tasmax_...'.
                    counters = {
                        var: sum(file.startswith(var + '_') for file in member_files)
                        for var in variable_list
                    }
                    print(counters)
                    # A script is still generated below; uncomment to skip instead:
                    #print('New files will not be downloaded')
                    #continue
                else:
                    print('No files for', exp, model, member, 'exist, so they will be downloaded')
            else:
                print(exp, model, member)

            #############
            # check if files are already downloaded:
            filestr = 'Amon_'+ model + '_' + exp + '_' + member
            #if filecheck(filestr, all_ncfiles) is not False:

            # Earlier query variants, kept for reference:
            #ctx = conn.new_context(project='CMIP6', table_id = 'Amon',\
            #       latest=True, source_id = model, replica = False,\
            #       experiment_id=exp, variable=variable_list, variant_label = member)
            #       latest=True, source_id = model, replica = True, data_node = 'esgf.nci.org.au',\
            ctx = conn.new_context(
                project='CMIP6', table_id='Amon',
                latest=True, source_id=model, data_node='esgf.nci.org.au',
                experiment_id=exp, variable=variable_list, variant_label=member,
            )

            wget_script_content = ctx.get_download_script()
            # mkstemp returns an already-open file descriptor; write through it so
            # it is closed again instead of leaking one descriptor per member.
            script_fd, script_path = tempfile.mkstemp(suffix='.sh', prefix='download-')
            with os.fdopen(script_fd, 'w') as writer:
                writer.write(wget_script_content)
            print(script_path)

            # running the script here seems to be slower than running it in the terminal..

            #os.chmod(script_path, 0o750)
            #download_dir = os.path.dirname(script_path)
            #print(download_dir)
            #subprocess.check_output("{}".format(script_path), cwd=download_dir)

            #############


## Move files to subdirectories

In [None]:
# Sort the downloaded NetCDF files into <download_dir>/CMIP6_data/<exp>_data/<model>/.
# NOTE(review): this path is machine-specific -- adjust before running elsewhere.
download_dir = '/var/folders/53/fvz76zt55t3fbyz2bxr_l6540000gn/T'

ncfiles = [f.name for f in os.scandir(download_dir) if f.name.endswith('.nc')]
for file in ncfiles:
    # Model and experiment are read from the 3rd and 4th underscore-separated
    # fields of the file name.
    filename_parts = file.split("_")
    if len(filename_parts) < 4:
        # Unrelated .nc files in the temp directory must not abort the whole move.
        print('Skipping', file, '- file name has no model/experiment fields')
        continue
    model = filename_parts[2]
    exp = filename_parts[3]
    subdir = os.path.join(download_dir, 'CMIP6_data', exp + '_data', model)
    # exist_ok avoids the separate isdir check and its race window.
    os.makedirs(subdir, exist_ok=True)
    shutil.move(os.path.join(download_dir, file), os.path.join(subdir, file))

## Make global average of data where averages do not exist yet

In [None]:
# Show which experiment sits at index 5 of experiment_list.
experiment_list[5]

In [None]:
# Compute global annual means from the downloaded files for every member that
# has NetCDF files on disk but no means CSV yet, and write the result to
# ../Processed_data/Global_annual_means_csv/<model>/<exp>/<model>_<exp>_<member>_means.csv
# NOTE(review): this path is machine-specific -- adjust before running elsewhere.
download_dir = '/var/folders/53/fvz76zt55t3fbyz2bxr_l6540000gn/T'

#for exp in [experiment_list[5]]:
for exp in experiment_list[1:]:
    # The JSON file maps each model name to the members to process.
    with open('../Data_availability/GoogleCloud/missing-'+exp+'.json', 'r') as f:
        data_dict = json.load(f)
    # Alternative input, kept as a manual toggle:
    #with open('../Data_availability/GoogleCloud/cloud-buterrors.json', 'r') as f:
    #     data_dict = json.load(f)
    #if exp in data_dict:
    #    data_dict = data_dict[exp]
    #    print(exp, data_dict)
    #else:
    #    continue
    #print(exp, data_dict)
    for model in data_dict:
        # Manual model filter; edit the list to process other models.
        #if model in ['EC-Earth3']:
        if model not in ['ICON-ESM-LR']:
            continue

        for member in data_dict[model]:
            # Comparison kept as "== True" because the helper's return type is
            # not visible here.
            if missing_esgf_test(exp, model, member) == True:
                continue
            #if member in ['r3i1p1f1', 'r20i1p1f1']:
            #    continue
            filename = model + '_' + exp + '_' + str(member) + '_means.csv'
            filepath = os.path.join('../Processed_data/Global_annual_means_csv/', model, exp)
            exp_model_dir = download_dir + '/CMIP6_data/' + exp + '_data/' + model

            # Without a download folder there is nothing to average.
            if not os.path.isdir(exp_model_dir):
                continue

            model_exp_files = [f.name for f in os.scandir(exp_model_dir) if f.name.endswith('.nc')]
            # Match the member as a whole underscore-delimited field so that
            # e.g. 'r1i1p1f1' does not also pick up files of 'r1i1p1f11'.
            member_files = [file for file in model_exp_files if '_' + member + '_' in file]
            member_paths = [os.path.join(exp_model_dir, file) for file in member_files]

            if member_files:
                # Count files per variable; the variable name is the leading
                # field of the file name, so 'tas' must not match 'tasmax_...'.
                counters = {
                    var: sum(file.startswith(var + '_') for file in member_files)
                    for var in variable_list
                }
                print(len(member_files), 'files for', exp, model, member, 'exist:', counters)

            if os.path.isfile(os.path.join(filepath, filename)):
                print(exp, model, member, 'global means already exist, so we will not make new averages')
                continue

            # perform averaging only if we have any files to average:
            if not member_paths:
                continue

            print(exp, model, member, ': global averaging will be performed')
            # Only the calendar is needed from the first file, so close it again
            # right away instead of leaving the handle open.
            with xr.open_dataset(member_paths[0]) as first_ds:
                ds_calendar = first_ds.time.encoding['calendar']
            print(ds_calendar)

            #if member in sepvar_dict_downloadeddata[exp][model]:
            #    avg_df = global_average_sepvar_downloadeddata(exp, model, member, member_paths, ds_calendar)
            #else:
            # concat_dim is deliberately not passed: it has no effect together
            # with combine='by_coords' and recent xarray versions reject it.
            ds = xr.open_mfdataset(member_paths, combine='by_coords', join='exact',
                                   parallel=True, preprocess=preprocess, use_cftime=True)
            # join = 'override' used for:
            # 1pctCO2 EC-Earth3 r3i1p1f1
            # because of different coordinate values
            try:
                #ds = ds.drop_vars('time_bnds')
                print(ds.coords)
                avg_df = global_averaging(ds, calendar = ds_calendar)

                # save dataframe as csv:
                os.makedirs(filepath, exist_ok=True)
                avg_df.to_csv(os.path.join(filepath, filename))
            finally:
                # Release the file handles of this member before the next one.
                ds.close()


In [None]:
# Display the most recently computed table of global means.
avg_df