# Search ESGF, using esgf-pyclient

In [None]:
from pyesgf.search import SearchConnection
import json
import pandas as pd
#import numpy as np
# Alternative index node (DKRZ), kept for reference:
#conn = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)
conn = SearchConnection('https://esgf-node.llnl.gov/esg-search', distrib=True) 
# distrib=True presumably asks the index node to forward the query to its
# peer nodes as well -- TODO confirm against the esgf-pyclient documentation.
#
# Open question: can we be sure to get all models and members when
# searching only through https://esgf-node.llnl.gov/esg-search ?


In [None]:
# Variables that every ensemble member must provide.
variable_list = ['tas', 'rlut', 'rsut', 'rsdt']

# Experiments of interest; the order matters because later cells select
# experiments by slicing this list.
experiment_list = [
    # control and idealised CO2 experiments
    'piControl', 'abrupt-4xCO2', '1pctCO2',
    'abrupt-2xCO2', 'abrupt-0p5xCO2',
    # historical experiments
    'historical', 'hist-GHG', 'hist-nat', 'hist-aer',
    # scenario experiments
    'ssp119', 'ssp126', 'ssp245', 'ssp370', 'ssp585',
    # fixed-SST (piClim) experiments
    'piClim-control', 'piClim-4xCO2', 'piClim-histall',
]

# functions:
def context_to_df(context):
    """Collect the dataset records found by a search context in a DataFrame.

    Parameters
    ----------
    context :
        Search context exposing ``hit_count`` and ``search()``. Each search
        result must have a ``dataset_id`` string of the form
        ``"<dot-separated dataset id>|<node>"``.

    Returns
    -------
    pandas.DataFrame
        One row per result with the columns ``exp``, ``source_id``,
        ``member_id``, ``variable``, ``version`` and ``node``. The frame is
        empty (but has these columns) when there are no hits.
    """
    column_names = ['exp', 'source_id', 'member_id', 'variable', 'version', 'node']

    # Run the search once and index into its result. Calling search() anew
    # for every record repeats the whole query each time, which is what made
    # this function so slow.
    results = context.search()

    rows = []
    for i in range(context.hit_count):  # this may take a while
        if i % 50 == 0 and i > 0:
            print(i)  # progress indicator
        info_split = results[i].dataset_id.split("|")
        node = info_split[-1]
        # Positions within the dot-separated id that are used below:
        # 3 = source_id, 4 = experiment, 5 = member, 7 = variable, 9 = version
        id_str = info_split[0].split(".")
        rows.append([id_str[4], id_str[3], id_str[5], id_str[7], id_str[9], node])

    # Build the frame in one go instead of concatenating row by row.
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Find first the models that have the variable 'tas' for both piControl and
# abrupt-4xCO2, and store the model list of each experiment as a json file.
exp_dict = {}
for experiment in ['piControl', 'abrupt-4xCO2']:
    ctx = conn.new_context(project='CMIP6', table_id='Amon',
                           latest=True, replica=True,  # need to search replicas at least for E3SM-1-0
                           experiment_id=experiment, variable='tas')
    print('Found', ctx.hit_count, 'results of "tas" for experiment', experiment)
    df = context_to_df(ctx)
    models = df['source_id'].unique()
    exp_dict[experiment] = list(models)
    # Write into the ESGF/ subdirectory, which is where the model lists are
    # read back from when they are intersected further down.
    # NOTE: exp_dict accumulates over the loop, so the second file also
    # contains the entry of the first experiment.
    with open('../Processed_data/Data_availability/ESGF/esgf-' + experiment + '_models.json', 'w') as f:
        json.dump(exp_dict, f, indent=2)


In [None]:
# Read the stored model lists of the two base experiments.
with open('../Processed_data/Data_availability/ESGF/esgf-piControl_models.json', 'r') as f:
    piControl_models = json.load(f)
with open('../Processed_data/Data_availability/ESGF/esgf-abrupt-4xCO2_models.json', 'r') as f:
    abrupt4xCO2_models = json.load(f)

# Keep only the models present in both lists, in alphabetical order.
models = sorted(set(piControl_models['piControl'])
                & set(abrupt4xCO2_models['abrupt-4xCO2']))

# For every experiment and model, record which members provide all variables
# in variable_list, together with the dataset version(s) found, and write one
# json file per experiment.
# NOTE: only the experiments from index 14 onwards are processed here; change
# the slice (or use the single-experiment variant below) to cover others.
#for experiment in [experiment_list[14]]:
for experiment in experiment_list[14:]:
    exp_dict = {}
    for model in models:
        exp_dict[model] = {}
        # Make a new search for each model, hoping that smaller searches
        # will be less time-consuming.
        search_kwargs = dict(project='CMIP6', table_id='Amon', latest=True,
                             source_id=model, experiment_id=experiment,
                             variable=variable_list)

        # Check only original records initially (no replicas).
        replica_search = False
        ctx = conn.new_context(replica=False, **search_kwargs)
        print('Found', ctx.hit_count, 'results for', model, experiment)
        if ctx.hit_count == 0:
            # Does this happen just because the website is unstable?
            # It happens only sometimes...
            print('Searching in replicas instead:')
            ctx = conn.new_context(replica=True, **search_kwargs)
            print('Found', ctx.hit_count, 'replica results for', model, experiment)
            replica_search = True
        model_df = context_to_df(ctx)

        for member in sorted(model_df['member_id'].unique()):
            # Check that each member has all variables. The check must use
            # the rows of this member only, not those of the whole model.
            member_df = model_df[model_df['member_id'] == member]
            member_vars = member_df['variable'].unique()

            if len(member_vars) < len(variable_list):
                print(experiment, model, member, 'has only the variables', member_vars)
                # Look for the missing variables in a search that is not
                # restricted to original records. If replicas were searched
                # already, there is nothing more to look up, so the member
                # data from above is kept as it is.
                if not replica_search:
                    ctx_member = conn.new_context(variant_label=member, **search_kwargs)
                    member_df = context_to_df(ctx_member)
                    member_vars = member_df['variable'].unique()
                if len(member_vars) < len(variable_list):
                    print('found no replicas containing all variables')
                    continue
                print('all variables exist in replicas')

            latest_version = member_df['version'].unique()
            exp_dict[model][member] = list(latest_version)

    with open('../Processed_data/Data_availability/ESGF/esgf-' + experiment + '.json', 'w') as f:
        json.dump(exp_dict, f, indent=2)
        
        

In [None]:
# Show the availability dictionary of the last experiment processed above.
exp_dict