In [1]:
###############################################################################
# Load all of the libraries
###############################################################################
import os

import pandas as pd

from stitches.pkgimports import *
import stitches.readpangeo as read


  from tqdm.autonotebook import tqdm


In [2]:
###############################################################################
# Let's take a look at what CMIP6 files are available (it may take a few seconds to load)
###############################################################################

# Open the CMIP6 ESM datastore described by the pangeo-cmip6.json catalog
# file; the bare name on the last line displays it as the cell output.
pangeo = intake.open_esm_datastore(
    "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
)
pangeo


Unnamed: 0,unique
activity_id,17
institution_id,36
source_id,86
experiment_id,168
member_id,650
table_id,37
variable_id,709
grid_label,10
zstore,501398
dcpp_init_year,60


In [3]:
###############################################################################
# Get a first cut list of models for proof of concept.
# Only take models that have daily data; if they have daily, they almost
# certainly have monthly. Daily netcdfs are less commonly submitted, so doing
# the search on only daily will probably return fewer models for our proof of
# concept, and then we can validate on both monthly and daily data.
###############################################################################

# Experiments of interest: the SSP scenarios plus the historical run.
expts = [
    'ssp126',
    'ssp245',
    'ssp370',
    'ssp585',
    'ssp119',
    'ssp434',
    'ssp460',
    'ssp534-over',
    "historical",
]

# Only the daily table is searched for this first cut.
table_ids = ['day']

# Preliminary model list built from the catalog for the experiments and
# tables above, requesting a minimum ensemble size of 5.
count_table = read.create_preliminary_model_list(
    pangeo,
    experiments=expts,
    table_ids=table_ids,
    min_ensemble_size=5,
)

# Show the full list of models to process.
print('Models to process', count_table["source_id"].values, sep='\n')


Models to process
['CanESM5' 'IPSL-CM6A-LR' 'CNRM-ESM2-1' 'EC-Earth3' 'UKESM1-0-LL'
 'EC-Earth3-Veg' 'MIROC6' 'ACCESS-ESM1-5' 'CNRM-CM6-1' 'MPI-ESM1-2-LR'
 'MPI-ESM1-2-HR' 'NorCPM1']


In [4]:
###############################################################################
# Turn the list of experiments and the models in count_table into a query to
# get a pangeo subsetted list of tas files that we want to work with.
#
# Available tas data is our first limiting step in terms of determining the
# models, experiments, and ensemble members we have available to create an
# archive for matching, because our primary matching is performed on smooth
# tgav time series.
###############################################################################

# CMIP6 models don't submit annual netcdfs, so tgav is calculated from the
# monthly (Amon) tas files selected here.
query = {
    'experiment_id': expts,
    'variable_id': 'tas',
    'source_id': count_table["source_id"].copy(),
    'table_id': 'Amon',
    'grid_label': 'gn',
}

# Search the pangeo catalog with the query above. The result is the master
# list of files to calculate Tgav across; it is then narrowed to the p1
# physics setting of each model via read.keep_p1_results.
tas_pangeo_subset = pangeo.search(**query)
tas_pangeo_subset.df = read.keep_p1_results(tas_pangeo_subset).copy()

# Put the variable id into the name of the file column and drop the
# variable_id column itself, which makes the joins further down easier.
new_col_name = tas_pangeo_subset.df['variable_id'].unique()[0] + "_zstore"
tas_pangeo_subset.df = (
    tas_pangeo_subset.df
    .rename(columns={'zstore': new_col_name})
    .drop(columns='variable_id')
    .copy()
)

# take a look
tas_pangeo_subset.df


Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,grid_label,tas_zstore,dcpp_init_year,version
0,CMIP,MIROC,MIROC6,historical,r8i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212
1,CMIP,MIROC,MIROC6,historical,r7i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212
2,CMIP,MIROC,MIROC6,historical,r1i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212
3,CMIP,MIROC,MIROC6,historical,r5i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212
4,CMIP,MIROC,MIROC6,historical,r3i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212
...,...,...,...,...,...,...,...,...,...,...
804,ScenarioMIP,MOHC,UKESM1-0-LL,ssp245,r5i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20201112
805,ScenarioMIP,MOHC,UKESM1-0-LL,ssp245,r12i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20201113
806,ScenarioMIP,MOHC,UKESM1-0-LL,ssp245,r19i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20201117
807,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r4i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20210205


In [5]:
###############################################################################
# We must then determine which variables we may want gridded data for at some
# point in our emulation. We must then join these to our tas pangeo_subset and
# save it as a metadata csv file.
#
# The csv file can then be used to filter archive data to only those
# experiment*ensemble members that have all the variables of interest for an
# experiment.
# Because this is metadata used for filtering and not any kind of data
# processing code, we go ahead and add a lot of variables we know we might
# care about at some point.

# First, define a function to do the query and joining to the df of tas meta-
# data.
###############################################################################
def query_and_join(cmip_var, time_var, join_to_df):
    """Look up one variable in the pangeo catalog and left-join its file paths.

    Searches the module-level ``pangeo`` catalog for ``cmip_var`` in table
    ``time_var``, restricted to the module-level ``expts`` experiments, the
    ``source_id`` values of ``count_table`` and the 'gn' grid label. The hits
    are filtered with ``read.keep_p1_results``, the ``zstore`` column is
    renamed to ``<variable>_zstore``, the ``variable_id`` column is dropped,
    and the result is left-merged onto ``join_to_df``.

    Parameters
    ----------
    cmip_var : str
        CMIP6 variable id to search for, e.g. 'psl'.
    time_var : str
        CMIP6 table id to search in, e.g. 'Amon'.
    join_to_df : pandas.DataFrame
        Metadata frame to join onto (left side of the merge).

    Returns
    -------
    pandas.DataFrame
        The left merge of ``join_to_df`` with the renamed search results.
        Rows of ``join_to_df`` without a match get a missing value in the
        new ``<variable>_zstore`` column.
    """
    query = dict(
        experiment_id=expts,
        variable_id=cmip_var,
        source_id=count_table["source_id"].copy(),
        table_id=time_var,
        grid_label='gn'
    )

    # Subset the pangeo catalog with our query, then keep only the p1 physics
    # setting from each model.
    pangeo_subset = pangeo.search(**query)

    # Work on a local frame rather than assigning back to pangeo_subset.df:
    # the catalog object is discarded when this function returns, so only the
    # frame itself is needed.
    subset_df = read.keep_p1_results(pangeo_subset).copy()

    # Name the file column after the variable. Use the variable id found in
    # the search results when there is one; fall back to the requested
    # cmip_var when the search returned no rows, instead of failing with an
    # IndexError on an empty array.
    found_vars = subset_df['variable_id'].unique()
    var_label = found_vars[0] if len(found_vars) > 0 else cmip_var
    new_col_name = var_label + "_zstore"
    subset_df = subset_df.rename(columns={'zstore': new_col_name}).drop(columns='variable_id')

    # Left join onto the incoming metadata frame. No `on` is given, so every
    # column the two frames share is used as a join key.
    # NOTE(review): if columns such as table_id or version are present in both
    # frames they take part in the match as well -- confirm that is intended.
    return pd.merge(left=join_to_df,
                    right=subset_df,
                    how='left')
# end function definition




In [6]:
###############################################################################
# use the function and iterate over new variables to add metadata for
###############################################################################
# Start from the tas metadata and join on the file paths of each additional
# monthly variable in turn; every pass adds one <variable>_zstore column.
new_df = tas_pangeo_subset.df
for extra_var in ('psl', 'pr', 'tasmax', 'tasmin'):
    new_df = query_and_join(cmip_var=extra_var, time_var='Amon', join_to_df=new_df)

# Take a look
new_df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,grid_label,tas_zstore,dcpp_init_year,version,psl_zstore,pr_zstore,tasmax_zstore,tasmin_zstore
0,CMIP,MIROC,MIROC6,historical,r8i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...
1,CMIP,MIROC,MIROC6,historical,r7i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...
2,CMIP,MIROC,MIROC6,historical,r1i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...
3,CMIP,MIROC,MIROC6,historical,r5i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...
4,CMIP,MIROC,MIROC6,historical,r3i1p1f1,Amon,gn,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,,20181212,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...,gs://cmip6/CMIP6/CMIP/MIROC/MIROC6/historical/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
639,ScenarioMIP,MOHC,UKESM1-0-LL,ssp245,r5i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20201112,,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,
640,ScenarioMIP,MOHC,UKESM1-0-LL,ssp245,r12i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20201113,,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,
641,ScenarioMIP,MOHC,UKESM1-0-LL,ssp245,r19i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20201117,,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,
642,ScenarioMIP,MOHC,UKESM1-0-LL,ssp585,r4i1p1f2,Amon,gn,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,20210205,,gs://cmip6/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/...,,


In [7]:
###############################################################################
# Save it off; using this csv file, we can then filter our archive of smoothed
# tgav values available for matching to only those experiment*ensemble members
# that have the particular variables of interest to us as gridded data.
###############################################################################

# Relative pathnames don't work when just running a python script, so the
# local location of the stitches directory is spelled out explicitly.
# Set the STITCHES_LOCAL_LOCATION environment variable to point at your own
# checkout instead of editing this cell; when it is unset, the default below
# is used.
local_location = os.environ.get(
    "STITCHES_LOCAL_LOCATION",
    "/Users/snyd535/Documents/task11a-topdown-clim-ML/stitches",
)

# Write the joined metadata table (without the index) into the notebook
# inputs directory under local_location.
new_df.to_csv(
    os.path.join(local_location, "notebooks", "stitches_dev", "inputs",
                 "pangeo_path_metadata_tas_psl_pr_tmax_tmin.csv"),
    index=False,
)
