# Parse CMIP6 models 
Identify models that contain all variables of interest and meet the required spatiotemporal resolution.

In [None]:
from pyesgf.search import SearchConnection
import os
from glob import glob
import json
import pandas as pd
import collections
import numpy as np

# Export the pyesgf environment flag for the rest of the session. Going by its
# name it silences the client's facets='*' warning -- confirm in the pyesgf docs.
os.environ.update({"ESGF_PYCLIENT_NO_FACETS_STAR_WARNING": '1'})

In [None]:
# Shared ESGF search connection used by the queries in this notebook.
# distrib=True is forwarded to pyesgf as-is (per its docs: distributed search).
ESGF_SEARCH_URL = 'https://esgf-node.llnl.gov/esg-search'
CONN = SearchConnection(ESGF_SEARCH_URL, distrib=True)

In [None]:
# Exploratory search context: daily historical `tas` at 100 km nominal
# resolution. NOTE(review): nothing visible in this notebook reads this
# module-level `ctx` afterwards; search_for_projects builds its own.
example_constraints = {
    'project': 'CMIP6',
    'experiment_id': 'historical',
    'variable': 'tas',
    'nominal_resolution': '100 km',
    'frequency': 'day',
    'facets': 'project,experiment_family,experiment_id',
}
ctx = CONN.new_context(**example_constraints)

## Functions to interact with ESGF

In [None]:
def search_for_projects(var='hurs', ssp='ssp585', resolution='100 km',
                        frequency='day'):
    """Search ESGF for CMIP6 datasets of one variable in one experiment.

    Parameters
    ----------
    var : str
        Variable name passed as the ``variable`` search constraint.
    ssp : str
        Value for the ``experiment_id`` constraint. Despite the name it is not
        restricted to SSP scenarios; this notebook also passes
        ``'historical'``.
    resolution : str
        Value for the ``nominal_resolution`` constraint. Defaults to
        ``'100 km'``, the value that used to be hard-coded.
    frequency : str
        Value for the ``frequency`` constraint. Defaults to ``'day'``, the
        value that used to be hard-coded.

    Returns
    -------
    list
        Everything yielded by ``ctx.search()``, materialised as a list.
    """
    # Uses the module-level CONN connection created earlier in the notebook.
    ctx = CONN.new_context(
        project='CMIP6',
        experiment_id=ssp,
        variable=var,
        nominal_resolution=resolution,
        frequency=frequency,
        facets='project,experiment_family,experiment_id',
    )
    return list(ctx.search())


def clean_search(search):
    """Map each dataset in *search* to the download URLs of its files.

    Parameters
    ----------
    search : iterable
        Dataset results. Each item must provide ``file_context()`` (whose
        ``search()`` yields objects with a ``download_url`` attribute) and a
        ``dataset_id`` attribute.

    Returns
    -------
    dict
        ``{dataset_id: [download_url, ...]}``. Datasets whose file search
        returns nothing are left out.
    """
    out = {}
    for dataset in search:
        # Materialise the listing exactly once. The previous any(files) check
        # followed by list(files) walked the results twice, which silently
        # drops the first file if the listing is a one-shot iterator, and it
        # tested element truthiness rather than "is the listing empty".
        files = list(dataset.file_context().search())
        if files:
            out[dataset.dataset_id] = [f.download_url for f in files]
    return out

## Download file metadata (dataset IDs and download URLs) for all variables of interest

In [None]:
# For each variable, cache the historical dataset-id -> download-URL mapping
# as a JSON file in the working directory. A variable whose file already
# exists is skipped, so re-running the cell only queries what is missing.
for var in ('va', 'rsds', 'tas', 'tasmax', 'tasmin', 'hursmax', 'hurs',
            'hursmin', 'pr', 'zg', 'ua', 'orog', 'uas', 'vas'):
    fp_out = f'./cmip6_meta_{var}_historical.json'
    if os.path.exists(fp_out):
        continue

    # Announce the file about to be produced before the (slow) search runs.
    print(fp_out)
    search = search_for_projects(var=var, ssp='historical')
    out = clean_search(search)
    with open(fp_out, 'w') as f:
        json.dump(out, f, indent=2, sort_keys=True)

## Identify Models that meet all criteria

In [None]:
# Build one row per (variable, dataset) hit of the ssp585 search.
# Rows are gathered in a plain list and turned into a DataFrame once, which
# avoids growing the frame row by row with df.loc[len(df)].
records = []
for var in ['va', 'rsds', 'tas', 'tasmax', 'tasmin', 'hursmax', 'hurs',
            'hursmin', 'pr', 'zg', 'ua', 'uas', 'vas']:
    search = search_for_projects(var=var, ssp='ssp585')
    for result in search:
        d = result.json
        records.append({
            # full dataset id for now; reduced to the model name further down
            'model': d['id'],
            'frequency': d['frequency'],
            'resolution': d['nominal_resolution'],
            # Missing dates become NaN. np.nan is used because the np.NaN
            # alias was removed in NumPy 2.0 and raises AttributeError there.
            'start_date': d.get('datetime_start', np.nan),
            'end_date': d.get('datetime_stop', np.nan),
        })

# dtype=object keeps every column as object, matching a frame that was filled
# row by row (and keeping string comparisons on start_date NaN-tolerant).
df = pd.DataFrame(
    records,
    columns=['model', 'frequency', 'resolution', 'start_date', 'end_date'],
    dtype=object,
)

# Split the dotted dataset id and keep fields 3, 4, 5 and 7 as
# model, scenario, variant and variable.
id_parts = df['model'].str.split('.', expand=True)
df[['model', 'scenario', 'variant', 'variable']] = id_parts.iloc[:, [3, 4, 5, 7]]

# Unpack list-valued cells so each row holds one resolution / one frequency.
df = df.explode('resolution')
df = df.explode('frequency')

# Preview: each row is a variable/model combination.
df.head()

In [None]:
# Keep only rows whose start_date is not later than the cutoff below.
# The comparison is against a string literal -- assumes start_date holds
# ISO-8601 text so that string ordering matches time ordering (confirm).
start_cutoff = '2015-01-01T12:00:00Z'
starts_by_cutoff = df['start_date'] <= start_cutoff
df = df[starts_by_cutoff]

# Then discard any remaining row that has no start date at all.
df = df.dropna(subset=['start_date'])

### Models that meet minimum variable requirements

In [None]:
# A model qualifies when every variable in `min_vars` appears among the
# rows recorded for it in `df`. Order follows first appearance in df.model.
min_vars = ['rsds', 'tas', 'tasmax', 'tasmin', 'hurs', 'pr']
required_min = set(min_vars)

models = [
    name
    for name in df['model'].unique()
    if required_min.issubset(df.loc[df['model'] == name, 'variable'].tolist())
]

models

### Models that contain all variables of interest

In [None]:
# Same check as for the minimum set, but against the full variable list:
# a model is kept only if each entry of `all_vars` occurs in its rows.
all_vars = ['va', 'rsds', 'tas', 'tasmax', 'tasmin', 'hursmax', 'hurs',
            'hursmin', 'pr', 'zg', 'ua']
required_all = set(all_vars)

models_all = [
    name
    for name in df['model'].unique()
    if required_all.issubset(df.loc[df['model'] == name, 'variable'].tolist())
]

models_all
        

In [None]:
# Restrict the table to the models that passed the minimum-variable check.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of `df` (the source of SettingWithCopyWarning).
model_df = df[df['model'].isin(models)].copy()

# Tag every row with a '<model>-<variant>' key for filtering by variant.
# Done as one column-wise operation instead of a .loc write per index label.
model_df['model_variant'] = model_df['model'] + '-' + model_df['variant']

model_df

## Retrieve download links for each model

In [None]:
variable_files = glob('./*.json')

sim = 'ssp585'
variant = 'r1i1p1f1'
node = 'esgf-data1.llnl.gov'

# Read every metadata file once, up front, instead of re-reading all of them
# for each model. Entries are kept as (dataset_id, urls) pairs so that an id
# appearing in more than one file still contributes each time.
# NOTE(review): the download cell in this notebook only writes
# *_historical.json files from a 'historical' search; with sim='ssp585' those
# ids presumably never match -- confirm the ssp585 metadata files exist.
datasets = []
for json_fp in variable_files:
    with open(json_fp) as f:
        datasets.extend(json.load(f).items())

for m in models:
    out_fp = f"{m.lower().replace('-', '')}_{sim}_links.csv"

    # Start from an empty list for every model; previously the accumulator
    # lived outside this loop, so each CSV also contained the links of all
    # models processed before it.
    urls = []
    for dataset_id, dataset_links in datasets:
        # Dataset ids look like '<dotted fields>|<data node>'. Compare the
        # fields exactly (same positions used when building `df`) so that a
        # model name that is a prefix of another one does not match it.
        drs, _, data_node = dataset_id.partition('|')
        fields = drs.split('.')
        if fields[3:6] == [m, sim, variant] and data_node == node:
            urls.extend(dataset_links)

    # Single unnamed column, one URL per row (written with header "0").
    model_links = pd.DataFrame(urls)
    model_links.to_csv(out_fp, index = False)
    print(f"Links written to {out_fp}.")