# Compare data availability in ESGF and Google Cloud

In [1]:
import intake
import os
import json

# Location of the Pangeo CMIP6 catalog description (a JSON file served from
# Google Cloud Storage).
col_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
# Open the catalog as an intake datastore. `col` is the object that the
# later cells narrow down with `.search(...)` and inspect through `.df`.
col = intake.open_esm_datastore(col_url)

In [2]:
# Directory holding one 'esgf-<experiment>.json' file per experiment, plus
# companion '*_models.json' files that are not wanted here.
esgf_dir = '../Processed_data/Data_availability/ESGF/'

# Fixed pieces of the per-experiment file names.
esgf_prefix = 'esgf-'
json_suffix = '.json'

# Keep only regular files that really follow the 'esgf-<experiment>.json'
# pattern. Without the explicit prefix/suffix checks, stray entries (hidden
# files, checkpoint directories, ...) would be sliced into bogus experiment
# names and make the later open() of 'esgf-<name>.json' fail.
files = [
    entry.name
    for entry in os.scandir(esgf_dir)
    if entry.is_file()
    and entry.name.startswith(esgf_prefix)
    and entry.name.endswith(json_suffix)
    and not entry.name.endswith('_models.json')
]

# Strip the prefix and the extension to recover the experiment id.
experiments = [name[len(esgf_prefix):-len(json_suffix)] for name in files]
print(experiments)

['ssp126', 'abrupt-2xCO2', 'ssp370', 'piControl', 'hist-aer', 'piClim-histall', 'ssp585', 'abrupt-4xCO2', 'ssp245', 'piClim-4xCO2', 'ssp119', 'historical', '1pctCO2', 'abrupt-0p5xCO2', 'piClim-control', 'hist-GHG', 'hist-nat']


In [3]:

# Variables that must all be present for a member to count as available in
# the cloud catalog.
var_list = ['tas', 'rlut', 'rsut', 'rsdt']

# Output directory for the per-experiment result files. It is created up
# front so that writing does not fail when the directory does not exist yet.
cloud_dir = '../Processed_data/Data_availability/GoogleCloud/'
os.makedirs(cloud_dir, exist_ok=True)

for experiment in experiments:
    # Load the ESGF availability for this experiment. The file maps
    # model -> member -> list of version strings; the first character of each
    # version string is dropped before converting it to an integer
    # (presumably a 'v' prefix, e.g. 'v20190101' -- TODO confirm).
    with open('../Processed_data/Data_availability/ESGF/esgf-'+experiment+'.json', 'r') as f:
        exp_dict = json.load(f)

    # Narrow the cloud catalog to this experiment, the wanted variables and
    # the 'Amon' table.
    col_exp = col.search(experiment_id=experiment, variable_id=var_list, table_id='Amon')

    # model -> members fully available in the cloud / members that are not.
    data_dict = {}
    missing_dict = {}

    for model in exp_dict:
        members = exp_dict[model]
        # Models without any ESGF member are skipped without a message.
        if not members:
            continue

        col_model = col_exp.search(source_id=model)
        missing_members = []
        cloud_members = []

        for member in members:
            col_member = col_model.search(member_id=member)
            unique_vars = col_member.df['variable_id'].unique()
            cloud_versions = col_member.df['version'].unique()
            # NOTE(review): the comparison below assumes the catalog's
            # 'version' column holds integers -- confirm.
            latest_esgf_version = [int(version[1:]) for version in members[member]]

            if len(unique_vars) < len(var_list):
                # At least one wanted variable is absent from the cloud.
                missing_vars = set(var_list) - set(unique_vars)
                missing_members.append(member)
                print('Missing data:', experiment, model, member, 'is missing the variables', missing_vars)
                continue

            # All variables exist; check that every latest ESGF version is
            # also among the versions found in the cloud for this member.
            version_check = set(latest_esgf_version).issubset(set(cloud_versions))
            if version_check:
                cloud_members.append(member)
            else:
                missing_members.append(member)
                print('Warning: only older version exist in the cloud for ', experiment, model, member)

        # Only record models that have at least one entry, so the output
        # files contain no empty lists.
        if cloud_members:
            data_dict[model] = cloud_members
        if missing_members:
            missing_dict[model] = missing_members

    # Write both dictionaries for this experiment to JSON files.
    with open(cloud_dir + 'cloud-' + experiment + '.json', 'w') as f:
        json.dump(data_dict, f, indent=2)
    with open(cloud_dir + 'missing-' + experiment + '.json', 'w') as f:
        json.dump(missing_dict, f, indent=2)


Missing data: ssp126 ACCESS-CM2 r4i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-CM2 r5i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r10i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r11i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r12i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r13i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r14i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r15i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r16i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r17i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 ACCESS-ESM1-5 r18i1p1f1 is missing th

  warn(message)


Missing data: ssp126 GISS-E2-1-G r101i1p1f1 is missing the variables {'rsdt', 'tas', 'rlut', 'rsut'}
Missing data: ssp126 GISS-E2-1-G r1i1p1f2 is missing the variables {'rsdt', 'tas', 'rlut', 'rsut'}
Missing data: ssp126 GISS-E2-1-G r2i1p1f2 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 GISS-E2-1-G r3i1p1f2 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 GISS-E2-1-G r4i1p1f2 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 GISS-E2-1-G r5i1p1f2 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 IPSL-CM5A2-INCA r1i1p1f1 is missing the variables {'rsut', 'rlut', 'rsdt'}
Missing data: ssp126 KIOST-ESM r1i1p1f1 is missing the variables {'rlut'}
Missing data: ssp126 MIROC-ES2L r10i1p1f2 is missing the variables {'rsut', 'rsdt'}
Missing data: ssp126 MIROC-ES2L r4i1p1f2 is missing the variables {'rsut', 'rsdt'}
Missing data: ssp126 MIROC-ES2L r5i1p1f2 is missing the variables {'rsut', 'rlut', 'rsdt'}
Miss